4 files changed, +67 -0 lines changed
from pyspark import SparkContext

if __name__ == "__main__":

    '''
    The "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's Apache servers for July 1st, 1995.
    The "in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995.
    Create a Spark program to generate a new RDD that contains the hosts which were accessed on BOTH days.
    Save the resulting RDD to the "out/nasa_logs_same_hosts.csv" file.

    Example output:
    vagrant.vf.mmc.com
    www-a1.proxy.aol.com
    .....

    Keep in mind that the original log files contain the following header line:
    host    logname    time    method    url    response    bytes

    Make sure the header lines are removed in the resulting RDD.
    '''
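For reference, a sketch of how one tab-separated log line maps to the header fields above and how the host would be extracted. This is not part of the exercise; the host comes from the example output above, but the remaining field values are hypothetical:

# Hypothetical log line, following the header fields described above:
# host, logname, time, method, url, response, bytes
line = "vagrant.vf.mmc.com\t-\t804571214\tGET\t/history/apollo/\t200\t6245"
host = line.split("\t")[0]
print(host)  # vagrant.vf.mmc.com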
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext("local", "sameHosts")

    julyFirstLogs = sc.textFile("in/nasa_19950701.tsv")
    augustFirstLogs = sc.textFile("in/nasa_19950801.tsv")

    # The host is the first tab-separated field of each log line.
    julyFirstHosts = julyFirstLogs.map(lambda line: line.split("\t")[0])
    augustFirstHosts = augustFirstLogs.map(lambda line: line.split("\t")[0])

    # Hosts that appear in both days' logs; intersection() also deduplicates.
    intersection = julyFirstHosts.intersection(augustFirstHosts)

    # Both files begin with a header line, so the literal field name "host"
    # survives the intersection and must be filtered out.
    cleanedHostIntersection = intersection.filter(lambda host: host != "host")
    cleanedHostIntersection.saveAsTextFile("out/nasa_logs_same_hosts.csv")
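A quick way to sanity-check the result before (or instead of) writing it to disk. This is a sketch, not part of the original solution; take() pulls a small local list, which is safe here because the host intersection of two 10000-line files is small:

    # Print a handful of hosts that appear in both days' logs.
    for host in cleanedHostIntersection.take(5):
        print(host)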
from pyspark import SparkContext

if __name__ == "__main__":

    '''
    The "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's Apache servers for July 1st, 1995.
    The "in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995.
    Create a Spark program to generate a new RDD that contains the log lines from both July 1st and August 1st,
    take a 0.1 sample of those log lines, and save it to the "out/sample_nasa_logs.tsv" file.

    Keep in mind that the original log files contain the following header line:
    host    logname    time    method    url    response    bytes

    Make sure the header lines are removed in the resulting RDD.
    '''
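A back-of-the-envelope check of the expected output size, as an arithmetic sketch based on the figures stated above (sample() is probabilistic, so actual counts will vary from run to run):

total_lines = 10000 + 10000 - 2            # both files, minus one header line per file
expected_sample_size = total_lines * 0.1   # ~2000 lines on average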
from pyspark import SparkContext

def isNotHeader(line: str):
    # A header line starts with the "host" field name and also names the "bytes" field.
    return not (line.startswith("host") and "bytes" in line)

if __name__ == "__main__":
    sc = SparkContext("local", "unionLogs")

    julyFirstLogs = sc.textFile("in/nasa_19950701.tsv")
    augustFirstLogs = sc.textFile("in/nasa_19950801.tsv")

    # union() simply concatenates the two RDDs; it does not deduplicate.
    aggregatedLogLines = julyFirstLogs.union(augustFirstLogs)

    # Drop the header line contributed by each input file, then take a 10% sample.
    cleanLogLines = aggregatedLogLines.filter(isNotHeader)
    sample = cleanLogLines.sample(withReplacement=True, fraction=0.1)

    sample.saveAsTextFile("out/sample_nasa_logs.tsv")
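One design note on sample(): without a seed, every run draws a different sample. RDD.sample() accepts an optional seed parameter, so a reproducible variant (a sketch, assuming repeatable output is wanted) would be:

    # Fix the seed so repeated runs produce the same 10% sample.
    sample = cleanLogLines.sample(withReplacement=True, fraction=0.1, seed=42)

Note also that withReplacement=True can emit the same log line more than once; passing withReplacement=False would yield a plain subset instead.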