4 files changed, +67 -0 lines changed
from pyspark import SparkContext

if __name__ == "__main__":

    '''
    The "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's Apache servers for July 1st, 1995.
    The "in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995.
    Create a Spark program to generate a new RDD that contains the hosts which were accessed on BOTH days.
    Save the resulting RDD to the "out/nasa_logs_same_hosts.csv" file.

    Example output:
    vagrant.vf.mmc.com
    www-a1.proxy.aol.com
    .....

    Keep in mind that the original log files contain the following header line:
    host    logname    time    method    url    response    bytes

    Make sure the header lines are removed in the resulting RDD.
    '''
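For reference, a sketch of how one tab-separated log line maps to the header fields above and how the host would be extracted. This is not part of the exercise; the host comes from the example output above, but the remaining field values are hypothetical:

# Hypothetical log line, following the header fields described above:
# host, logname, time, method, url, response, bytes
line = "vagrant.vf.mmc.com\t-\t804571214\tGET\t/history/apollo/\t200\t6245"
host = line.split("\t")[0]
print(host)  # vagrant.vf.mmc.com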
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext("local", "sameHosts")

    julyFirstLogs = sc.textFile("in/nasa_19950701.tsv")
    augustFirstLogs = sc.textFile("in/nasa_19950801.tsv")

    # The host is the first tab-separated field of each log line.
    julyFirstHosts = julyFirstLogs.map(lambda line: line.split("\t")[0])
    augustFirstHosts = augustFirstLogs.map(lambda line: line.split("\t")[0])

    # Hosts that appear in both days' logs; intersection() also deduplicates.
    intersection = julyFirstHosts.intersection(augustFirstHosts)

    # Both files begin with a header line, so the literal field name "host"
    # survives the intersection and must be filtered out.
    cleanedHostIntersection = intersection.filter(lambda host: host != "host")
    cleanedHostIntersection.saveAsTextFile("out/nasa_logs_same_hosts.csv")
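A quick way to sanity-check the result before (or instead of) writing it to disk. This is a sketch, not part of the original solution; take() pulls a small local list, which is safe here because the host intersection of two 10000-line files is small:

    # Print a handful of hosts that appear in both days' logs.
    for host in cleanedHostIntersection.take(5):
        print(host)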
from pyspark import SparkContext

if __name__ == "__main__":

    '''
    The "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's Apache servers for July 1st, 1995.
    The "in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995.
    Create a Spark program to generate a new RDD that contains the log lines from both July 1st and August 1st,
    take a 0.1 sample of those log lines, and save it to the "out/sample_nasa_logs.tsv" file.

    Keep in mind that the original log files contain the following header line:
    host    logname    time    method    url    response    bytes

    Make sure the header lines are removed in the resulting RDD.
    '''
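A back-of-the-envelope check of the expected output size, as an arithmetic sketch based on the figures stated above (sample() is probabilistic, so actual counts will vary from run to run):

total_lines = 10000 + 10000 - 2            # both files, minus one header line per file
expected_sample_size = total_lines * 0.1   # ~2000 lines on average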
from pyspark import SparkContext

def isNotHeader(line: str):
    # A header line starts with the "host" field name and also names the "bytes" field.
    return not (line.startswith("host") and "bytes" in line)

if __name__ == "__main__":
    sc = SparkContext("local", "unionLogs")

    julyFirstLogs = sc.textFile("in/nasa_19950701.tsv")
    augustFirstLogs = sc.textFile("in/nasa_19950801.tsv")

    # union() simply concatenates the two RDDs; it does not deduplicate.
    aggregatedLogLines = julyFirstLogs.union(augustFirstLogs)

    # Drop the header line contributed by each input file, then take a 10% sample.
    cleanLogLines = aggregatedLogLines.filter(isNotHeader)
    sample = cleanLogLines.sample(withReplacement=True, fraction=0.1)

    sample.saveAsTextFile("out/sample_nasa_logs.tsv")
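One design note on sample(): without a seed, every run draws a different sample. RDD.sample() accepts an optional seed parameter, so a reproducible variant (a sketch, assuming repeatable output is wanted) would be:

    # Fix the seed so repeated runs produce the same 10% sample.
    sample = cleanLogLines.sample(withReplacement=True, fraction=0.1, seed=42)

Note also that withReplacement=True can emit the same log line more than once; passing withReplacement=False would yield a plain subset instead.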