
Coding Notes Data Science

pd.read_csv
import pandas as pd

# reading a csv file into a DataFrame
df = pd.read_csv("people.csv")
df
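read_csv also accepts parameters for less tidy files. A minimal sketch (the usecols names and na_values strings below are hypothetical, not from the original notes):

# sep sets the delimiter, usecols limits which columns are read,
# and na_values lists strings pandas should treat as missing
df = pd.read_csv(
    "people.csv",
    sep=",",
    usecols=["name", "age"],  # hypothetical column names
    na_values=["NA", "?"],
)
df.head()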
pd.read_excel
pip install pandas
pip install openpyxl  # engine pandas uses for .xlsx files (xlrd only reads legacy .xls)
import pandas as pd
df = pd.read_excel("sample.xlsx")
print(df)
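Workbooks often hold more than one sheet. A small sketch, assuming a hypothetical sheet named 'Sheet2' in sample.xlsx, using read_excel's sheet_name parameter:

# sheet_name selects a specific sheet; sheet_name=None returns a dict of all sheets
df2 = pd.read_excel("sample.xlsx", sheet_name="Sheet2")
all_sheets = pd.read_excel("sample.xlsx", sheet_name=None)
print(list(all_sheets.keys()))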
pd.read_sql
# import the modules
import pandas as pd
from sqlalchemy import create_engine

# SQLAlchemy connectable
cnx = create_engine('sqlite:///contacts.db').connect()

# the table named 'contacts' is returned as a DataFrame
df = pd.read_sql_table('contacts', cnx)
print(df)
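read_sql_table loads an entire table; pd.read_sql can run an arbitrary SQL query instead. A sketch against the same contacts.db connection (the name column in the query is hypothetical):

# run a query instead of loading the whole table
query = "SELECT * FROM contacts WHERE name LIKE 'A%'"  # hypothetical column
df_filtered = pd.read_sql(query, cnx)
print(df_filtered)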
pd.read_table
# importing pandas
import pandas as pd

# read_table defaults to tab-delimited; pass delimiter=',' to read a CSV
pd.read_table('people.csv', delimiter=',')
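read_table's default delimiter is a tab, so tab-separated files need no extra arguments. A minimal sketch, assuming a hypothetical people.tsv:

# no delimiter needed: read_table assumes tab-separated values by default
df_tab = pd.read_table('people.tsv')
df_tab.head()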
Clean a real-world messy dataset (e.g., from Kaggle)
# modules we'll use
import pandas as pd
import numpy as np

# read in all our data
nfl_data = pd.read_csv("../input/nflplaybyplay2009to2016/NFL Play by Play 2009-2017 (v4).csv")

# set seed for reproducibility
np.random.seed(0)
# look at the first five rows of the nfl_data file.
# I can see a handful of missing data already!
nfl_data.head()
# get the number of missing data points per column
missing_values_count = nfl_data.isnull().sum()

# look at the # of missing points in the first ten columns
missing_values_count[0:10]
# how many total missing values do we have?
total_cells = np.prod(nfl_data.shape)
total_missing = missing_values_count.sum()

# percent of data that is missing
percent_missing = (total_missing / total_cells) * 100
print(percent_missing)
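The same idea works per column, which is often more useful than one overall percentage. A short sketch built on the counts computed above:

# percent missing per column, sorted so the worst offenders come first
percent_missing_by_column = (missing_values_count / len(nfl_data)) * 100
percent_missing_by_column.sort_values(ascending=False)[0:10]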
# remove all the rows that contain a missing value
nfl_data.dropna()
# remove all columns with at least one missing value
columns_with_na_dropped = nfl_data.dropna(axis=1)
columns_with_na_dropped.head()
# just how much data did we lose?
print("Columns in original dataset: %d \n" % nfl_data.shape[1])
print("Columns with na's dropped: %d" % columns_with_na_dropped.shape[1])
# get a small subset of the NFL dataset
subset_nfl_data = nfl_data.loc[:, 'EPA':'Season'].head()
subset_nfl_data
# replace all NA's with 0
subset_nfl_data.fillna(0)
# replace each NA with the value that comes directly after it in the same column,
# then replace any remaining NA's with 0
subset_nfl_data.bfill(axis=0).fillna(0)
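Back-filling is only one strategy; numeric columns are often imputed with a summary statistic instead. A sketch on the same subset (numeric_only guards against any non-numeric columns):

# replace NA's in each column with that column's mean
subset_nfl_data.fillna(subset_nfl_data.mean(numeric_only=True))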
Apply EDA on a student performance dataset
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')
df.info()
df.describe()
df.shape
df.isnull().sum() #checks if there are any missing values
Let's start by plotting some graphs.
plt.rcParams['figure.figsize'] = (20, 10)
sns.countplot(x='math score', data=df, palette='dark')
plt.title('Math Score',fontsize = 20)
plt.show()
To analyse the data more deeply, let's add a few new columns: total marks, percentage, and grades.

df['total marks'] = df['math score'] + df['reading score'] + df['writing score']
df['percentage'] = df['total marks'] / 300 * 100

# assigning the grades

def determine_grade(scores):
    if scores >= 85 and scores <= 100:
        return 'Grade A'
    elif scores >= 70 and scores < 85:
        return 'Grade B'
    elif scores >= 55 and scores < 70:
        return 'Grade C'
    elif scores >= 35 and scores < 55:
        return 'Grade D'
    elif scores >= 0 and scores < 35:
        return 'Grade E'

df['grades']=df['percentage'].apply(determine_grade)
df.info()
df['grades'].value_counts().plot.pie(autopct="%1.1f%%")
plt.show()
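With the percentage and grade columns in place, groupby makes quick comparisons easy. A sketch assuming the dataset's 'gender' and 'test preparation course' columns (standard in this Kaggle file, but check df.columns first):

# average percentage by gender
print(df.groupby('gender')['percentage'].mean())

# average scores split by test preparation course
print(df.groupby('test preparation course')[['math score', 'reading score', 'writing score']].mean())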
Implementation of Linear Regression Model
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Sample data (replace with your actual data)
X = np.array([[1], [2], [3], [4], [5]])  # independent variable (features)
y = np.array([2, 4, 5, 4, 5])  # dependent variable (target)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Train the model using the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Print the coefficients
print(f"Intercept: {model.intercept_}")
print(f"Coefficient: {model.coef_}")

Implementation of Random Forest Model


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Load the data (replace 'your_data.csv' with your actual file)
data = pd.read_csv('your_data.csv')

# Separate features (X) and target (y)
X = data.drop('target_column_name', axis=1)  # replace 'target_column_name'
y = data['target_column_name']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier
# n_estimators is the number of trees in the forest
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
