Code for Data Cleaning with Pandas in Python Tutorial


View on GitHub

data_cleaning.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

print(data_frames.head(10))

data_cleaning2.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# info() prints its report directly, so a print() wrapper is unnecessary
data_frames.info()

data_cleaning3.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')

# Convert donation strings into a numeric data type
data_frames['donation'] = data_frames['donation'].astype('float64')

print(data_frames.head(10))
data_frames.info()
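
A note on the conversion above: astype('float64') raises a ValueError if any donation string fails to parse. A more forgiving variant, not part of the tutorial files, is pd.to_numeric with errors='coerce', which turns unparseable values into NaN so they can be dropped later:

# sketch: coerce bad donation strings to NaN instead of raising an error
data_frames['donation'] = pd.to_numeric(
    data_frames['donation'].str.strip('$'), errors='coerce'
)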

data_cleaning4.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')

# Convert donation strings into a numeric data type
data_frames['donation'] = data_frames['donation'].astype('float64')


# Handle Data Inconsistencies
# Capitalize strings
data_frames['street_address'] = data_frames['street_address'].str.split()

def capitalize_words(arr):
    # skip the first word (the street number); capitalize the rest in place
    for index, word in enumerate(arr):
        if index != 0:
            arr[index] = word.capitalize()

# apply() mutates each word list in place, so its return value is discarded
data_frames['street_address'].apply(capitalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')

print(data_frames['street_address'])
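
For addresses like these, the same result is available without a helper: the vectorized str.title() capitalizes the first letter of every word, and digits are unaffected, so the street number is left alone. A one-line sketch, applied to the raw string column before any split():

# sketch: vectorized alternative to capitalize_words()
data_frames['street_address'] = data_frames['street_address'].str.title()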

data_cleaning5.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')

# Convert donation strings into a numeric data type
data_frames['donation'] = data_frames['donation'].astype('float64')


# Handle Data Inconsistencies
# Normalize strings
data_frames['street_address'] = data_frames['street_address'].str.split()

def normalize_words(arr):
    # skip the first word (the street number); normalize the rest in place
    for index, word in enumerate(arr):
        if index != 0:
            arr[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then fix capitalization
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'

    return word.capitalize()


# apply() mutates each word list in place, so its return value is discarded
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')

print(data_frames.head(10))
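
The simulated data only ever contains the bare abbreviations 'St' and 'Rd', so the equality checks in normalize() are enough. If real data included punctuated forms such as 'St.', stripping a trailing period first would catch them; a sketch of that hypothetical extension:

def normalize(word):
    # strip a trailing period so 'St.' and 'Rd.' also match
    cleaned = word.lower().rstrip('.')
    if cleaned == 'st':
        word = 'street'
    elif cleaned == 'rd':
        word = 'road'

    return word.capitalize()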

data_cleaning6.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')

# Convert donation strings into a numeric data type
data_frames['donation'] = data_frames['donation'].astype('float64')


# Handle Data Inconsistencies
# Normalize strings
data_frames['street_address'] = data_frames['street_address'].str.split()

def normalize_words(arr):
    # skip the first word (the street number); normalize the rest in place
    for index, word in enumerate(arr):
        if index != 0:
            arr[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then fix capitalization
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'

    return word.capitalize()


# apply() mutates each word list in place, so its return value is discarded
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')


# Remove Out-of-Range Data
# create boolean Series flagging out-of-range (negative) donations
out_of_range = data_frames['donation'] < 0

# assigning the filtered Series back sets out-of-range donations to NaN
data_frames['donation'] = data_frames['donation'][~out_of_range]

print(data_frames.head(10))
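
Note that the assignment above masks negative donations to NaN rather than deleting rows; the NaNs are removed by dropna() in a later script. If you would rather drop the offending rows immediately, boolean indexing on the whole frame works; a sketch:

# sketch: drop rows with out-of-range donations outright
data_frames = data_frames[~out_of_range]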

data_cleaning7.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')

# Convert donation strings into a numeric data type
data_frames['donation'] = data_frames['donation'].astype('float64')


# Handle Data Inconsistencies
# Normalize strings
data_frames['street_address'] = data_frames['street_address'].str.split()

def normalize_words(arr):
    # skip the first word (the street number); normalize the rest in place
    for index, word in enumerate(arr):
        if index != 0:
            arr[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then fix capitalization
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'

    return word.capitalize()


# apply() mutates each word list in place, so its return value is discarded
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')


# Remove Out-of-Range Data
# create boolean Series flagging out-of-range (negative) donations
out_of_range = data_frames['donation'] < 0

# assigning the filtered Series back sets out-of-range donations to NaN
data_frames['donation'] = data_frames['donation'][~out_of_range]


# Remove duplicates
columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
data_frames_no_dupes = data_frames.drop_duplicates(subset=columns_to_check, keep='first')

data_frames_no_dupes.info()
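
Before dropping anything, it can help to eyeball which records are duplicated. duplicated() with keep=False flags every member of each duplicate group, not just the repeats; a sketch:

# sketch: show all rows that belong to a duplicate group
dupes = data_frames[data_frames.duplicated(subset=columns_to_check, keep=False)]
print(dupes)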

data_cleaning8.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')

# Convert donation strings into a numeric data type
data_frames['donation'] = data_frames['donation'].astype('float64')


# Handle Data Inconsistencies
# Normalize strings
data_frames['street_address'] = data_frames['street_address'].str.split()

def normalize_words(arr):
    # skip the first word (the street number); normalize the rest in place
    for index, word in enumerate(arr):
        if index != 0:
            arr[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then fix capitalization
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'

    return word.capitalize()


# apply() mutates each word list in place, so its return value is discarded
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')


# Remove Out-of-Range Data
# create boolean Series flagging out-of-range (negative) donations
out_of_range = data_frames['donation'] < 0

# assigning the filtered Series back sets out-of-range donations to NaN
data_frames['donation'] = data_frames['donation'][~out_of_range]


# Remove duplicates
columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
data_frames_no_dupes = data_frames.drop_duplicates(subset=columns_to_check, keep='first')


# Drop Missing Data
columns_to_check = ['state', 'donation']
data_frames_no_missing = data_frames_no_dupes.dropna(subset=columns_to_check)


print(data_frames_no_missing.head(20))
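
To see how many values dropna() is about to remove, count the missing entries first; a sketch:

# sketch: per-column count of missing values in the checked columns
print(data_frames_no_dupes[columns_to_check].isna().sum())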

data_cleaning9.py

import pandas as pd

# Config settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 12)

# Import CSV data
data_frames = pd.read_csv('simulated_data.csv')

# Data Type Conversion
# Remove '$' from donation strings
data_frames['donation'] = data_frames['donation'].str.strip('$')

# Convert donation strings into a numeric data type
data_frames['donation'] = data_frames['donation'].astype('float64')


# Handle Data Inconsistencies
# Normalize strings
data_frames['street_address'] = data_frames['street_address'].str.split()

def normalize_words(arr):
    # skip the first word (the street number); normalize the rest in place
    for index, word in enumerate(arr):
        if index != 0:
            arr[index] = normalize(word)

def normalize(word):
    # expand common road-type abbreviations, then fix capitalization
    if word.lower() == 'st':
        word = 'street'
    elif word.lower() == 'rd':
        word = 'road'

    return word.capitalize()


# apply() mutates each word list in place, so its return value is discarded
data_frames['street_address'].apply(normalize_words)
data_frames['street_address'] = data_frames['street_address'].str.join(' ')


# Remove Out-of-Range Data
# create boolean Series flagging out-of-range (negative) donations
out_of_range = data_frames['donation'] < 0

# assigning the filtered Series back sets out-of-range donations to NaN
data_frames['donation'] = data_frames['donation'][~out_of_range]


# Remove duplicates
columns_to_check = ['first_name', 'last_name', 'street_address', 'city', 'state']
data_frames_no_dupes = data_frames.drop_duplicates(subset=columns_to_check, keep='first')


# Drop Missing Data
columns_to_check = ['state', 'donation']
data_frames_no_missing = data_frames_no_dupes.dropna(subset=columns_to_check)


print(data_frames_no_missing.head(20))


data_frames_no_missing.to_csv('clean_donations_data.csv', index=False)
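
Passing index=False keeps the DataFrame index out of the CSV. A quick round-trip check, not part of the tutorial files, confirms what was written:

# sketch: re-read the cleaned file and inspect its columns and dtypes
pd.read_csv('clean_donations_data.csv').info()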

helpers.py

import random

def add_donations(rows):
    total_donations = len(rows)

    # create list of random donation values
    donations = list_of_donations(total_donations)

    # append one donation to each record
    for count in range(total_donations):
        rows[count].append(donations[count])


def create_row_base():
    first_name_options = ['Rosemaria', 'Jodi', 'Alvy', 'Blake', 'Ellis', '']
    last_name_options = ['Roderick', 'Hesbrook', 'Summerton', 'Rappport', 'Alben', '']
    city_options = ['Hialeah', 'Arlington', 'Springfield', 'Carrollton', 'Cambridge', '']
    state_options = ['CT', 'NY', 'VA', 'WA', 'AZ', '']

    first_name = random.choice(first_name_options)
    last_name = random.choice(last_name_options)
    street = street_address()
    city = random.choice(city_options)
    state = random.choice(state_options)
    
    return [
        first_name, 
        last_name, 
        street, 
        city, 
        state
    ]


def list_of_donations(size):
    donations = []

    donation_amt = random_dollar_amt()
    for i in range(size):
        # ~80% of the time draw a fresh amount; otherwise reuse the previous one
        if random.choice([1, 2, 3, 4, 5]) > 1:
            donation_amt = random_dollar_amt()
        donations.append(donation_amt)

    return donations


def random_dollar_amt():
    # negative amounts are generated on purpose, giving the tutorial
    # out-of-range data to clean
    dollars = random.randint(-50, 200)
    cents = random.randint(0, 99)
    # zero-pad cents so 5 cents renders as '.05', not '.5'
    return '${}.{:02d}'.format(dollars, cents)


def scramble_capitalization(text):
    # randomly upper/lower-case each letter to simulate messy formatting
    final_str = ''
    for letter in text:
        final_str += random.choice([letter.upper(), letter.lower()])
    return final_str


def street_address():
    num = random.randint(40, 1001)
    road_name = random.choice(['Western Plank', 'Forest Run', 'Kings', 'Oaktree'])
    # '' yields addresses with no road type at all
    road_type = random.choice(['Street', 'St', 'Road', 'Rd', ''])

    return '{} {} {}'.format(num, road_name, road_type)

simulator.py

import csv
import random
import helpers

def generate_dataset():
    rows = []

    # generate list of base records: names data + address data
    for _ in range(20):
        row_to_add = helpers.create_row_base()
        rows.append(row_to_add)

        # randomly add duplicate records
        if random.choice([1, 2, 3, 4, 5]) > 2:
            rows.append(row_to_add.copy())
            # scramble formatting of street address
            rows[-1][2] = helpers.scramble_capitalization(rows[-1][2])

    # add donation amounts to each record
    helpers.add_donations(rows)

    return rows


with open('simulated_data.csv', 'w', newline='') as f:
    f_csv = csv.writer(f)
    
    # write headers first
    f_csv.writerow(['first_name','last_name','street_address',
        'city','state', 'donation'])
    f_csv.writerows(generate_dataset())
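
Running python simulator.py (re)writes simulated_data.csv in the current directory. Because the generator relies on the random module, every run produces different data; for a reproducible dataset you could seed the generator before the file is written, an optional addition:

# sketch: seed the random module so repeated runs produce identical data
random.seed(42)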

