
Scala to Python - rdd folder #1


Merged: 10 commits, Sep 29, 2017
5 changes: 5 additions & 0 deletions commons/Utils.py
@@ -0,0 +1,5 @@
import re

class Utils():

    COMMA_DELIMITER = re.compile(''',(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''')
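A quick illustration of what that pattern does (a minimal sketch, run from the project root so commons is importable; the sample line is made up, not taken from the dataset): it splits on commas, but only on commas that sit outside quoted fields.

from commons.Utils import Utils

sample = '1,"Goroka","Goroka, Papua New Guinea","Papua New Guinea","GKA"'
print(Utils.COMMA_DELIMITER.split(sample))
# ['1', '"Goroka"', '"Goroka, Papua New Guinea"', '"Papua New Guinea"', '"GKA"']
# The lookahead only succeeds when the rest of the line contains balanced
# quotes, i.e. when the comma is not inside a quoted field.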
13 changes: 7 additions & 6 deletions rdd/WordCount.py
@@ -2,9 +2,10 @@
from pyspark import SparkContext

if __name__ == "__main__":
-    sc = SparkContext("local", "word count")
-    lines = sc.textFile("in/word_count.text")
-    words = lines.flatMap(lambda line: line.split(" "))
-    wordCounts = words.countByValue()
-    for word, count in wordCounts.items():
-        print(word, count)
+    sc = SparkContext("local", "word count")
+    sc.setLogLevel("ERROR")
+    lines = sc.textFile("in/word_count.text")
+    words = lines.flatMap(lambda line: line.split(" "))
+    wordCounts = words.countByValue()
+    for word, count in wordCounts.items():
+        print(word, count)
17 changes: 17 additions & 0 deletions rdd/airports/AirportsByLatitudeProblem.py
@@ -0,0 +1,17 @@
from pyspark import SparkContext

if __name__ == "__main__":

    '''
    Create a Spark program to read the airport data from in/airports.text, find all the airports whose latitude is bigger than 40.
    Then output the airport's name and the airport's latitude to out/airports_by_latitude.text.

    Each row of the input file contains the following columns:
    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
    ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format

    Sample output:
    "St Anthony", 51.391944
    "Tofino", 49.082222
    ...
    '''
20 changes: 0 additions & 20 deletions rdd/airports/AirportsByLatitudeProblem.scala

This file was deleted.

17 changes: 17 additions & 0 deletions rdd/airports/AirportsByLatitudeSolution.py
@@ -0,0 +1,17 @@
from pyspark import SparkContext
from commons.Utils import Utils

def splitComma(line: str):
Owner

Have you tried to run this program? It doesn't compile

  File "/Users/cwei/code/python-spark-tutorial-new/rdd/airports/AirportsByLatitudeSolution.py", line 4
    def splitComma(line: str):
                       ^
SyntaxError: invalid syntax

Collaborator Author

Hi, yes, I did run all the programs. Which version of Python are you running? This should work in the latest Python 3 version.

Owner

Sorry for the confusion. It works for Python 3; I was running Python 2.7. Feel free to ignore this comment.
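For anyone who does need to stay on Python 2.7, a minimal sketch of the same helper with the annotation dropped (function annotations are Python 3 syntax; the behaviour is otherwise unchanged):

def splitComma(line):
    # identical to the version below, minus the Python 3-only ": str" annotation
    splits = Utils.COMMA_DELIMITER.split(line)
    return "{}, {}".format(splits[1], splits[6])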

    splits = Utils.COMMA_DELIMITER.split(line)
    return "{}, {}".format(splits[1], splits[6])

if __name__ == "__main__":
    sc = SparkContext("local", "airports")

    airports = sc.textFile("in/airports.text")

    airportsInUSA = airports.filter(lambda line: float(Utils.COMMA_DELIMITER.split(line)[6]) > 40)

    airportsNameAndCityNames = airportsInUSA.map(splitComma)

    airportsNameAndCityNames.saveAsTextFile("out/airports_by_latitude.text")
23 changes: 0 additions & 23 deletions rdd/airports/AirportsByLatitudeSolution.scala

This file was deleted.

17 changes: 17 additions & 0 deletions rdd/airports/AirportsInUsaProblem.py
@@ -0,0 +1,17 @@
from pyspark import SparkContext

if __name__ == "__main__":

    '''
    Create a Spark program to read the airport data from in/airports.text, find all the airports which are located in the United States
    and output the airport's name and the city's name to out/airports_in_usa.text.

    Each row of the input file contains the following columns:
    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
    ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format

    Sample output:
    "Putnam County Airport", "Greencastle"
    "Dowagiac Municipal Airport", "Dowagiac"
    ...
    '''
19 changes: 0 additions & 19 deletions rdd/airports/AirportsInUsaProblem.scala

This file was deleted.

15 changes: 15 additions & 0 deletions rdd/airports/AirportsInUsaSolution.py
@@ -0,0 +1,15 @@
from pyspark import SparkContext
from commons.Utils import Utils

def splitComma(line: str):
Owner

Again, it didn't compile; I think you don't need the type.

  File "/Users/cwei/code/python-spark-tutorial-new/rdd/airports/AirportsInUsaSolution.py", line 4
    def splitComma(line: str):
                       ^
SyntaxError: invalid syntax

    splits = Utils.COMMA_DELIMITER.split(line)
    return "{}, {}".format(splits[1], splits[2])

if __name__ == "__main__":
    sc = SparkContext("local", "airports")

    airports = sc.textFile("in/airports.text")
    airportsInUSA = airports.filter(lambda line: Utils.COMMA_DELIMITER.split(line)[3] == "\"United States\"")

    airportsNameAndCityNames = airportsInUSA.map(splitComma)
    airportsNameAndCityNames.saveAsTextFile("out/airports_in_usa.text")
22 changes: 0 additions & 22 deletions rdd/airports/AirportsInUsaSolution.scala

This file was deleted.

10 changes: 10 additions & 0 deletions rdd/collect/CollectExample.py
@@ -0,0 +1,10 @@
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext("local", "collect")
Owner

Please set the logging level to ERROR similar to what the Scala problem does to reduce the noise of the output

Collaborator Author

Hi James, some considerations about the logging level when using pyspark:

  • From the script itself, when using pyspark, we can only set the log level after starting the SparkContext, which means that logs printed while the SparkContext is starting will show up anyway.
  • The best way to reduce the noise of the output is to configure the log4j.properties file inside the spark/conf folder.

That being said, I will set the log level to ERROR after the SparkContext starts.
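As a reference for that second point, a minimal sketch of the log4j.properties change (this assumes the log4j 1.x format that Spark ships with; copy conf/log4j.properties.template to conf/log4j.properties first):

# conf/log4j.properties -- silence everything below ERROR on the console
log4j.rootCategory=ERROR, console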

    sc.setLogLevel("ERROR")
    inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
    wordRdd = sc.parallelize(inputWords)
    words = wordRdd.collect()
    for word in words:
        print(word)
20 changes: 0 additions & 20 deletions rdd/collect/CollectExample.scala

This file was deleted.

12 changes: 12 additions & 0 deletions rdd/count/CountExample.py
@@ -0,0 +1,12 @@
from pyspark import SparkContext

if __name__ == "__main__":
Owner

Again, set the logging level to ERROR.

    sc = SparkContext("local", "count")
    sc.setLogLevel("ERROR")
    inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
    wordRdd = sc.parallelize(inputWords)
    print("Count: {}".format(wordRdd.count()))
    wordCountByValue = wordRdd.countByValue()
    print("CountByValue: ")
    for word, count in wordCountByValue.items():
        print("{} : {}".format(word, count))
23 changes: 0 additions & 23 deletions rdd/count/CountExample.scala

This file was deleted.

20 changes: 20 additions & 0 deletions rdd/nasaApacheWebLogs/SameHostsProblem.py
@@ -0,0 +1,20 @@
from pyspark import SparkContext

if __name__ == "__main__":

    '''
    "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's Apache servers for July 1st, 1995.
    "in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995.
    Create a Spark program to generate a new RDD which contains the hosts which are accessed on BOTH days.
    Save the resulting RDD to "out/nasa_logs_same_hosts.csv" file.

    Example output:
    vagrant.vf.mmc.com
    www-a1.proxy.aol.com
    .....

    Keep in mind that the original log files contain the following header line:
    host    logname    time    method    url    response    bytes

    Make sure the header lines are removed in the resulting RDD.
    '''
23 changes: 0 additions & 23 deletions rdd/nasaApacheWebLogs/SameHostsProblem.scala

This file was deleted.

15 changes: 15 additions & 0 deletions rdd/nasaApacheWebLogs/SameHostsSolution.py
@@ -0,0 +1,15 @@
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext("local", "sameHosts")

    julyFirstLogs = sc.textFile("in/nasa_19950701.tsv")
    augustFirstLogs = sc.textFile("in/nasa_19950801.tsv")

    julyFirstHosts = julyFirstLogs.map(lambda line: line.split("\t")[0])
    augustFirstHosts = augustFirstLogs.map(lambda line: line.split("\t")[0])

    intersection = julyFirstHosts.intersection(augustFirstHosts)

    cleanedHostIntersection = intersection.filter(lambda host: host != "host")
    cleanedHostIntersection.saveAsTextFile("out/nasa_logs_same_hosts.csv")
23 changes: 0 additions & 23 deletions rdd/nasaApacheWebLogs/SameHostsSolution.scala

This file was deleted.

15 changes: 15 additions & 0 deletions rdd/nasaApacheWebLogs/UnionLogProblem.py
@@ -0,0 +1,15 @@
from pyspark import SparkContext

if __name__ == "__main__":

    '''
    "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's Apache servers for July 1st, 1995.
    "in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995.
    Create a Spark program to generate a new RDD which contains the log lines from both July 1st and August 1st,
    take a 0.1 sample of those log lines and save it to "out/sample_nasa_logs.tsv" file.

    Keep in mind that the original log files contain the following header line:
    host    logname    time    method    url    response    bytes

    Make sure the header lines are removed in the resulting RDD.
    '''
18 changes: 0 additions & 18 deletions rdd/nasaApacheWebLogs/UnionLogProblem.scala

This file was deleted.

17 changes: 17 additions & 0 deletions rdd/nasaApacheWebLogs/UnionLogSolutions.py
@@ -0,0 +1,17 @@
from pyspark import SparkContext

def isNotHeader(line: str):
    return not (line.startswith("host") and "bytes" in line)

if __name__ == "__main__":
    sc = SparkContext("local", "unionLogs")

    julyFirstLogs = sc.textFile("in/nasa_19950701.tsv")
    augustFirstLogs = sc.textFile("in/nasa_19950801.tsv")

    aggregatedLogLines = julyFirstLogs.union(augustFirstLogs)

    cleanLogLines = aggregatedLogLines.filter(isNotHeader)
    sample = cleanLogLines.sample(withReplacement=True, fraction=0.1)

    sample.saveAsTextFile("out/sample_nasa_logs.csv")
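A note on running these scripts locally (a sketch under assumptions about the local setup, not something this PR configures): the solutions that import from commons expect the project root to be importable, so the simplest option is to launch them from the repository root, for example:

export PYTHONPATH=$(pwd)
spark-submit rdd/airports/AirportsByLatitudeSolution.py

On a cluster the helper module would typically be shipped with spark-submit's --py-files option instead.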