
Commit e673318: Add files via upload
1 parent 5e56ac2

File tree

6 files changed, +664 -0 lines changed


BeautifulSoup4_sample.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
import requests
from bs4 import BeautifulSoup
import csv

# Fetch the page and print the HTTP status code to confirm the request worked.
page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)
soup = BeautifulSoup(page.text, 'html.parser')

# Collect every unique href found in an <a> tag.
link_set = set()
for link in soup.find_all('a'):
    web_links = link.get("href")
    print(web_links)
    link_set.add(web_links)

# Write the collected links to a one-column CSV file.
csvfile = open('code_python.csv', 'w+', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Links'])
for link in link_set:
    writer.writerow([link])
csvfile.close()
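
# A slightly more defensive variant (an illustrative sketch, not part of this commit): it reuses
# page and link_set from above, checks the HTTP status before writing, skips anchors without an
# href, and uses a with-block so the file is closed even if writing fails. The filename is hypothetical.
#if page.status_code == 200:
#    with open('code_python_links.csv', 'w', newline='') as f:
#        writer = csv.writer(f)
#        writer.writerow(['Links'])
#        for link in sorted(l for l in link_set if l):
#            writer.writerow([link])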

OlxRecommenderSystem.py

Lines changed: 184 additions & 0 deletions
@@ -0,0 +1,184 @@
# -*- coding: utf-8 -*-
"""
OLX Challenge 2: Recommender System
"""

# # Finding Similar Movies
# We'll start by loading up the MovieLens dataset. Using Pandas, we can very quickly load the rows of the u.data and u.item files that we care about, and merge them together so we can work with movie names instead of IDs. (In a real production job, you'd stick with IDs and worry about the names at the display layer to make things more efficient. But this lets us understand what's going on better for now.)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import re
#from datetime import datetime

r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv('C:/Users/xxx/DataScience-Python3_/ml-100k/u.data', sep='\t', names=r_cols, usecols=range(3), encoding="ISO-8859-1")

m_cols = ['movie_id', 'title', 'date']
movies = pd.read_csv('C:/Users/xxx/DataScience-Python3_/ml-100k/u.item', sep='|', names=m_cols, usecols=range(3), encoding="ISO-8859-1")

ratings = pd.merge(movies, ratings)
ratings.head()


# Data exploration

plt.hist(ratings['rating'])
ratings.groupby(['rating'])['user_id'].count()
# The most common rating is 4 stars.

plt.hist(ratings.groupby(['movie_id'])['movie_id'].count())
# The distribution of ratings per movie shows that the lower movie_ids have more ratings than later ones.

plt.hist(ratings.groupby(['user_id'])['user_id'].count())
# A small number of users has rated most of the movies (roughly a power-law shape, in my opinion).

# MOD
# Filter to movies released in 1970 or later (both attempts left commented out; a working sketch follows below).
#ratings = ratings[ratings['title'].str.contains('19[789]\d', regex=True)]
##ratings = ratings[datetime.strptime(ratings['date'], '%Y').date() >= datetime.strptime('1977', '%Y').date()]
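
# A minimal working sketch of that filter (illustrative, not part of this commit). It assumes the
# 'date' column holds the u.item release dates (e.g. '01-Jan-1995'); unparseable dates become NaT
# and fail the comparison. Left commented out so the rest of the script still sees the full dataset.
#release_year = pd.to_datetime(ratings['date'], errors='coerce').dt.year
#ratings = ratings[release_year >= 1970]
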
# Rating matrix representation
# The pivot_table function on a DataFrame will construct a user / movie rating matrix.
# Note that NaN indicates missing data - movies that specific users didn't rate.
movieRatings = ratings.pivot_table(index=['user_id'], columns=['title'], values='rating')
movieRatings.head()

# This does the same but creates a float matrix instead of a DataFrame.
n_users = ratings['user_id'].unique().shape[0]
print(str(n_users) + ' users')
n_items = ratings['movie_id'].unique().shape[0]
print(str(n_items) + ' movies')
movieRatingsZ = np.zeros((n_users, n_items))
for row in ratings.itertuples():
    # After the merge the positional column order is movie_id, title, date, user_id, rating,
    # so use named attributes to fill the user x movie cells unambiguously.
    movieRatingsZ[row.user_id - 1, row.movie_id - 1] = row.rating

movieRatings.shape
movieRatingsZ.shape  # has more columns? (likely because the pivot is indexed by title, so duplicate titles collapse into one column)

# Let's determine the sparsity in the data.
sparsity = float(len(movieRatingsZ.nonzero()[0]))  # 0 means not rated / missing here
sparsity /= (movieRatingsZ.shape[0] * movieRatingsZ.shape[1])
sparsity *= 100
print('Sparsity: {:4.2f}%'.format(sparsity))
#should be 6.30% - mine is 5.71% - that must be the result of the initial merge of the two datasets..

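# Quick sanity check of the 6.30% figure (a worked example, not from the file): ml-100k has
# 100,000 ratings over 943 users x 1,682 movies, so the filled fraction is
# 100000 / (943 * 1682) = 100000 / 1586126 ≈ 0.0630, i.e. about 6.30% of the cells are rated.
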
# Item Based CF
# Let's extract a Series of users who rated Star Wars:
starWarsRatings = movieRatings['Star Wars (1977)']
starWarsRatings.head()

# Pandas' corrwith function makes it really easy to compute the pairwise correlation of Star Wars' vector of user ratings with every other movie! After that, we'll drop any results that have no data, and construct a new DataFrame of movies and their correlation score (similarity) to Star Wars:
similarMovies = movieRatings.corrwith(starWarsRatings)
similarMovies = similarMovies.dropna()
df = pd.DataFrame(similarMovies)
df.head(10)

# (That warning is safe to ignore.) Let's sort the results by similarity score, and we should have the movies most similar to Star Wars! Except... we don't. These results make no sense at all! This is why it's important to know your data - clearly we missed something important.
similarMovies.sort_values(ascending=False)

# Our results are probably getting messed up by movies that have only been viewed by a handful of people who also happened to like Star Wars. So we need to get rid of movies that were only watched by a few people and are producing spurious results. Let's construct a new DataFrame that counts up how many ratings exist for each movie, and also the average rating while we're at it - that could come in handy later.

movieStats = ratings.groupby('title').agg({'rating': [np.size, np.mean]})
movieStats.head()

# Let's get rid of any movies rated by fewer than 150 people (the prose below still refers to the original cutoff of 100), and check the top-rated ones that are left:
popularMovies = movieStats['rating']['size'] >= 150
movieStats[popularMovies].sort_values([('rating', 'mean')], ascending=False)[:15]

#re.search(r'\((\d{4})\)', movieStats['rating']['title'])
#newerpopularMovies = re.search(r'\((\d{4})\)', movieStats['rating']['title'])[1]

# The cutoff might still be too low, but these results look pretty good as far as "well rated movies that people have heard of." Let's join this data with our original set of similar movies to Star Wars:
df = movieStats[popularMovies].join(pd.DataFrame(similarMovies, columns=['similarity']))
df.head()

# And, sort these new results by similarity score. That's more like it!
df.sort_values(['similarity'], ascending=False)[1:26]
# Ideally we'd also filter out the movie we started from - of course Star Wars is 100% similar to itself (a sketch of that filter follows below). But otherwise these results aren't bad.

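# A minimal sketch of that self-exclusion (illustrative, not part of this commit): drop the seed
# movie's row by its index label before sorting, so the trivial self-similarity of 1.0 disappears.
#df.drop('Star Wars (1977)', errors='ignore').sort_values(['similarity'], ascending=False)[:25]
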
# ## Activity
# The cutoff was chosen arbitrarily (100 in the original walkthrough, 150 above). Try different values - what effect does it have on the end results? A small sketch follows below.

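# Illustrative sketch for the activity (not part of this commit): rebuild the popularity filter for a
# few cutoffs and watch how the top of the similarity list changes.
#for cutoff in (50, 100, 150, 200):
#    popular = movieStats['rating']['size'] >= cutoff
#    top = movieStats[popular].join(pd.DataFrame(similarMovies, columns=['similarity']))
#    print(cutoff, top.sort_values(['similarity'], ascending=False)[1:6].index.tolist())
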
# UBCF

# This rebuilds the same dense float matrix as in the section above.
n_users = ratings['user_id'].unique().shape[0]
print(str(n_users) + ' users')
n_items = ratings['movie_id'].unique().shape[0]
print(str(n_items) + ' movies')
movieRatingsZ = np.zeros((n_users, n_items))
for row in ratings.itertuples():
    # Named attributes again: the merged frame's positional order is movie_id, title, date, user_id, rating.
    movieRatingsZ[row.user_id - 1, row.movie_id - 1] = row.rating

movieRatings.shape
movieRatingsZ.shape

import sklearn.metrics.pairwise
from sklearn.model_selection import train_test_split  # was sklearn.cross_validation in older scikit-learn
ratings_train, ratings_test = train_test_split(movieRatingsZ, test_size=0.33, random_state=42)
# Let's see the dimensions of the train set and the test set:
ratings_train.shape
ratings_test.shape

# Create pairwise similarity calculations for each user in the rating matrix;
# we have to calculate the similarity of each user with all the other users.

# Subtracting the cosine distances from 1 turns the distance matrix into a cosine similarity matrix.
dist_out = 1 - sklearn.metrics.pairwise.cosine_distances(ratings_train)
# The result is a square matrix whose size equals the number of training users.
dist_out.shape

# Predict the unknown ratings for an active user:
# unknown values can be calculated for all users by taking the dot product between the
# similarity matrix and the rating matrix and then normalizing by the sum of the absolute similarities.
user_pred = dist_out.dot(ratings_train) / np.array([np.abs(dist_out).sum(axis=1)]).T

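# A tiny worked example of that formula (illustrative numbers only, not from the data):
#   sim = np.array([[1.0, 0.5], [0.5, 1.0]]); R = np.array([[4.0, 0.0], [2.0, 4.0]])
#   sim.dot(R) / np.abs(sim).sum(axis=1, keepdims=True)
#   row 0 becomes (1.0*[4, 0] + 0.5*[2, 4]) / 1.5 = [5.0, 2.0] / 1.5 ≈ [3.33, 1.33]
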
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Keep only the nonzero (actually rated) entries; zeros mean "not rated".
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

get_mse(user_pred, ratings_train)  # ~8.0 MSE
get_mse(user_pred, ratings_test)   # ~9.0 MSE


# Use k-nearest neighbors instead.
from sklearn.neighbors import NearestNeighbors
# Let's compute the top five similar users by setting a variable, k.
k = 5
neigh = NearestNeighbors(n_neighbors=k, metric='cosine')
neigh.fit(ratings_train)
top_k_distances, top_k_users = neigh.kneighbors(ratings_train, return_distance=True)

# Let's see the top five users that are similar to user 1 in the training set:
top_k_users[0]
a_user_ratings = ratings[ratings['user_id'].isin([0])]  # note: MovieLens user_ids start at 1, so row index 0 corresponds to user_id 1

top_k_user_ratings = ratings[ratings['user_id'].isin([253, 134, 43, 602])]
top_k_user_stats = top_k_user_ratings.groupby('title').agg({'rating': [np.size, np.mean]})
top_k_user_stats.head()
#list(top_k_user_stats.columns.values)  # get the column names
df1 = df.assign(top_k_user_stats=df[('rating', 'mean')] * top_k_user_stats[('rating', 'size')] / k)
#list(df1.columns.values)  # get the column names
df2 = df1.sort_values([('top_k_user_stats', ''), ('rating', 'mean'), ('rating', 'size')], ascending=False)
df3 = df2[df2[('rating', 'mean')] >= 4.2]
df3.head()

# Now choose only the top k users for each user and use their rating information,
# predicting ratings as the weighted sum of the ratings of these top similar users.
user_pred_k = np.zeros(ratings_train.shape)
for i in range(ratings_train.shape[0]):
    # Weight each neighbour's rating vector and normalize by the total weight for user i.
    user_pred_k[i, :] = top_k_distances[i].T.dot(ratings_train[top_k_users][i]) / np.array([np.abs(top_k_distances[i].T).sum(axis=0)]).T

user_pred_k
# Now let's see if the model has improved or not.
get_mse(user_pred_k, ratings_train)  # ~8.0 MSE
get_mse(user_pred_k, ratings_test)   # ~12.0 MSE
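
# Illustrative follow-up (not part of this commit): repeat the neighbourhood prediction for a few
# values of k to see how the test MSE reacts; all names reuse the variables defined above.
#for k_try in (5, 10, 25, 50):
#    nn = NearestNeighbors(n_neighbors=k_try, metric='cosine').fit(ratings_train)
#    d, idx = nn.kneighbors(ratings_train, return_distance=True)
#    pred = np.zeros(ratings_train.shape)
#    for i in range(ratings_train.shape[0]):
#        pred[i, :] = d[i].dot(ratings_train[idx[i]]) / np.abs(d[i]).sum()
#    print(k_try, get_mse(pred, ratings_train), get_mse(pred, ratings_test))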

0 commit comments
