From 8838805d14bb61a0fc8a79526334d424ce5dd5e0 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Wed, 27 Sep 2017 18:10:05 +0200
Subject: [PATCH 01/10] Added rdd/reduce/ReduceExample.py

---
 rdd/reduce/ReduceExample.py | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 rdd/reduce/ReduceExample.py

diff --git a/rdd/reduce/ReduceExample.py b/rdd/reduce/ReduceExample.py
new file mode 100644
index 00000000..717e97d3
--- /dev/null
+++ b/rdd/reduce/ReduceExample.py
@@ -0,0 +1,8 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+    sc = SparkContext("local", "reduce")
+    inputIntegers = [1, 2, 3, 4, 5]
+    integerRdd = sc.parallelize(inputIntegers)
+    product = integerRdd.reduce(lambda x, y: x * y)
+    print("product is :{}".format(product))

From 4d7b7a4d04c90319b92d97c4dceda6f42950bf0e Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Wed, 27 Sep 2017 18:11:23 +0200
Subject: [PATCH 02/10] Added rdd/count/CountExample.py

---
 rdd/count/CountExample.py | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 rdd/count/CountExample.py

diff --git a/rdd/count/CountExample.py b/rdd/count/CountExample.py
new file mode 100644
index 00000000..a9f35316
--- /dev/null
+++ b/rdd/count/CountExample.py
@@ -0,0 +1,11 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+    sc = SparkContext("local", "count")
+    inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
+    wordRdd = sc.parallelize(inputWords)
+    print("Count: {}".format(wordRdd.count()))
+    worldCountByValue = wordRdd.countByValue()
+    print("CountByValue: ")
+    for word, count in worldCountByValue.items():
+        print("{} : {}".format(word, count))
\ No newline at end of file

From 7b40998c2217bead4d135987cf7cfc593dee6672 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Wed, 27 Sep 2017 18:16:33 +0200
Subject: [PATCH 03/10] Added rdd/collect/CollectExample.py

---
 rdd/collect/CollectExample.py | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 rdd/collect/CollectExample.py

diff --git a/rdd/collect/CollectExample.py b/rdd/collect/CollectExample.py
new file mode 100644
index 00000000..431aadfd
--- /dev/null
+++ b/rdd/collect/CollectExample.py
@@ -0,0 +1,9 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+    sc = SparkContext("local", "collect")
+    inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
+    wordRdd = sc.parallelize(inputWords)
+    words = wordRdd.collect()
+    for word in words:
+        print(word)
\ No newline at end of file

From 652d9ffca835b30f48943f07cb1e1a2d7c157ebc Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Thu, 28 Sep 2017 12:01:53 +0200
Subject: [PATCH 04/10] Added rdd/nasaApacheWebLogs/*.py

---
 rdd/nasaApacheWebLogs/SameHostsProblem.py  | 20 ++++++++++++++++++++
 rdd/nasaApacheWebLogs/SameHostsSolution.py | 15 +++++++++++++++
 rdd/nasaApacheWebLogs/UnionLogProblem.py   | 15 +++++++++++++++
 rdd/nasaApacheWebLogs/UnionLogSolutions.py | 17 +++++++++++++++++
 4 files changed, 67 insertions(+)
 create mode 100644 rdd/nasaApacheWebLogs/SameHostsProblem.py
 create mode 100644 rdd/nasaApacheWebLogs/SameHostsSolution.py
 create mode 100644 rdd/nasaApacheWebLogs/UnionLogProblem.py
 create mode 100644 rdd/nasaApacheWebLogs/UnionLogSolutions.py

diff --git a/rdd/nasaApacheWebLogs/SameHostsProblem.py b/rdd/nasaApacheWebLogs/SameHostsProblem.py
new file mode 100644
index 00000000..cf728235
--- /dev/null
+++ b/rdd/nasaApacheWebLogs/SameHostsProblem.py
@@ -0,0 +1,20 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's apache server for July 1st, 1995.
+    "in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995
+    Create a Spark program to generate a new RDD which contains the hosts which are accessed on BOTH days.
+    Save the resulting RDD to "out/nasa_logs_same_hosts.csv" file.
+
+    Example output:
+    vagrant.vf.mmc.com
+    www-a1.proxy.aol.com
+    .....
+
+    Keep in mind, that the original log files contains the following header lines.
+    host logname time method url response bytes
+
+    Make sure the head lines are removed in the resulting RDD.
+    '''
diff --git a/rdd/nasaApacheWebLogs/SameHostsSolution.py b/rdd/nasaApacheWebLogs/SameHostsSolution.py
new file mode 100644
index 00000000..7081dce9
--- /dev/null
+++ b/rdd/nasaApacheWebLogs/SameHostsSolution.py
@@ -0,0 +1,15 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+    sc = SparkContext("local", "sameHosts")
+
+    julyFirstLogs = sc.textFile("in/nasa_19950701.tsv")
+    augustFirstLogs = sc.textFile("in/nasa_19950801.tsv")
+
+    julyFirstHosts = julyFirstLogs.map(lambda line: line.split("\t")[0])
+    augustFirstHosts = augustFirstLogs.map(lambda line: line.split("\t")[0])
+
+    intersection = julyFirstHosts.intersection(augustFirstHosts)
+
+    cleanedHostIntersection = intersection.filter(lambda host: host != "host")
+    cleanedHostIntersection.saveAsTextFile("out/nasa_logs_same_hosts.csv")
diff --git a/rdd/nasaApacheWebLogs/UnionLogProblem.py b/rdd/nasaApacheWebLogs/UnionLogProblem.py
new file mode 100644
index 00000000..1d7a783b
--- /dev/null
+++ b/rdd/nasaApacheWebLogs/UnionLogProblem.py
@@ -0,0 +1,15 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's apache server for July 1st, 1995.
+    "in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995
+    Create a Spark program to generate a new RDD which contains the log lines from both July 1st and August 1st,
+    take a 0.1 sample of those log lines and save it to "out/sample_nasa_logs.tsv" file.
+
+    Keep in mind, that the original log files contains the following header lines.
+    host logname time method url response bytes
+
+    Make sure the head lines are removed in the resulting RDD.
+    '''
diff --git a/rdd/nasaApacheWebLogs/UnionLogSolutions.py b/rdd/nasaApacheWebLogs/UnionLogSolutions.py
new file mode 100644
index 00000000..067e2579
--- /dev/null
+++ b/rdd/nasaApacheWebLogs/UnionLogSolutions.py
@@ -0,0 +1,17 @@
+from pyspark import SparkContext
+
+def isNotHeader(line:str):
+    return not (line.startswith("host") and "bytes" in line)
+
+if __name__ == "__main__":
+    sc = SparkContext("local", "unionLogs")
+
+    julyFirstLogs = sc.textFile("in/nasa_19950701.tsv")
+    augustFirstLogs = sc.textFile("in/nasa_19950801.tsv")
+
+    aggregatedLogLines = julyFirstLogs.union(augustFirstLogs)
+
+    cleanLogLines = aggregatedLogLines.filter(isNotHeader)
+    sample = cleanLogLines.sample(withReplacement = True, fraction = 0.1)
+
+    sample.saveAsTextFile("out/sample_nasa_logs.csv")
\ No newline at end of file

From 80c20eda168703ddd053886b36374dc421a5795e Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Thu, 28 Sep 2017 12:02:32 +0200
Subject: [PATCH 05/10] Added rdd/persist/PersistExample.py

---
 rdd/persist/PersistExample.py | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 rdd/persist/PersistExample.py

diff --git a/rdd/persist/PersistExample.py b/rdd/persist/PersistExample.py
new file mode 100644
index 00000000..947a6e14
--- /dev/null
+++ b/rdd/persist/PersistExample.py
@@ -0,0 +1,9 @@
+from pyspark import SparkContext, StorageLevel
+
+if __name__ == "__main__":
+    sc = SparkContext("local", "persist")
+    inputIntegers = [1, 2, 3, 4, 5]
+    integerRdd = sc.parallelize(inputIntegers)
+    integerRdd.persist(StorageLevel.MEMORY_ONLY)
+    integerRdd.reduce(lambda x, y: x*y)
+    integerRdd.count()

From 08b146a2cd6937dd4e3a8f44040deb26905890f0 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Thu, 28 Sep 2017 16:12:42 +0200
Subject: [PATCH 06/10] Added commons/Utils.py

---
 commons/Utils.py | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 commons/Utils.py

diff --git a/commons/Utils.py b/commons/Utils.py
new file mode 100644
index 00000000..2038566d
--- /dev/null
+++ b/commons/Utils.py
@@ -0,0 +1,5 @@
+import re
+
+class Utils():
+
+    COMMA_DELIMITER = re.compile(''',(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''')

From afc939b980cf048cd6ccc7923c1b41530c88d353 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Thu, 28 Sep 2017 16:13:48 +0200
Subject: [PATCH 07/10] Added rdd/airports/*.py

---
 rdd/airports/AirportsByLatitudeProblem.py  | 17 +++++++++++++++++
 rdd/airports/AirportsByLatitudeSolution.py | 17 +++++++++++++++++
 rdd/airports/AirportsInUsaProblem.py       | 17 +++++++++++++++++
 rdd/airports/AirportsInUsaSolution.py      | 15 +++++++++++++++
 4 files changed, 66 insertions(+)
 create mode 100644 rdd/airports/AirportsByLatitudeProblem.py
 create mode 100644 rdd/airports/AirportsByLatitudeSolution.py
 create mode 100644 rdd/airports/AirportsInUsaProblem.py
 create mode 100644 rdd/airports/AirportsInUsaSolution.py

diff --git a/rdd/airports/AirportsByLatitudeProblem.py b/rdd/airports/AirportsByLatitudeProblem.py
new file mode 100644
index 00000000..306f4a7b
--- /dev/null
+++ b/rdd/airports/AirportsByLatitudeProblem.py
@@ -0,0 +1,17 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    Create a Spark program to read the airport data from in/airports.text, find all the airports whose latitude are bigger than 40.
+    Then output the airport's name and the airport's latitude to out/airports_by_latitude.text.
+
+    Each row of the input file contains the following columns:
+    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
+    ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
+
+    Sample output:
+    "St Anthony", 51.391944
+    "Tofino", 49.082222
+    ...
+    '''
diff --git a/rdd/airports/AirportsByLatitudeSolution.py b/rdd/airports/AirportsByLatitudeSolution.py
new file mode 100644
index 00000000..2ce476d5
--- /dev/null
+++ b/rdd/airports/AirportsByLatitudeSolution.py
@@ -0,0 +1,17 @@
+from pyspark import SparkContext
+from commons.Utils import Utils
+
+def splitComma(line: str):
+    splits = Utils.COMMA_DELIMITER.split(line)
+    return "{}, {}".format(splits[1], splits[6])
+
+if __name__ == "__main__":
+    sc = SparkContext("local", "airports")
+
+    airports = sc.textFile("in/airports.text")
+
+    airportsInUSA = airports.filter(lambda line: float(Utils.COMMA_DELIMITER.split(line)[6]) > 40)
+
+    airportsNameAndCityNames = airportsInUSA.map(splitComma)
+
+    airportsNameAndCityNames.saveAsTextFile("out/airports_by_latitude.text")
\ No newline at end of file
diff --git a/rdd/airports/AirportsInUsaProblem.py b/rdd/airports/AirportsInUsaProblem.py
new file mode 100644
index 00000000..4e76489c
--- /dev/null
+++ b/rdd/airports/AirportsInUsaProblem.py
@@ -0,0 +1,17 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    Create a Spark program to read the airport data from in/airports.text, find all the airports which are located in United States
+    and output the airport's name and the city's name to out/airports_in_usa.text.
+
+    Each row of the input file contains the following columns:
+    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
+    ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
+
+    Sample output:
+    "Putnam County Airport", "Greencastle"
+    "Dowagiac Municipal Airport", "Dowagiac"
+    ...
+    '''
diff --git a/rdd/airports/AirportsInUsaSolution.py b/rdd/airports/AirportsInUsaSolution.py
new file mode 100644
index 00000000..044a29a1
--- /dev/null
+++ b/rdd/airports/AirportsInUsaSolution.py
@@ -0,0 +1,15 @@
+from pyspark import SparkContext
+from commons.Utils import Utils
+
+def splitComma(line: str):
+    splits = Utils.COMMA_DELIMITER.split(line)
+    return "{}, {}".format(splits[1], splits[2])
+
+if __name__ == "__main__":
+    sc = SparkContext("local", "count")
+
+    airports = sc.textFile("in/airports.text")
+    airportsInUSA = airports.filter(lambda line : Utils.COMMA_DELIMITER.split(line)[3] == "\"United States\"")
+
+    airportsNameAndCityNames = airportsInUSA.map(splitComma)
+    airportsNameAndCityNames.saveAsTextFile("out/airports_in_usa.text")

From c7168ce247b5f4e705d96c24f4674ccabb8d01e2 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Thu, 28 Sep 2017 16:22:30 +0200
Subject: [PATCH 08/10] Fixed app name on rdd/airports/AirportsInUsaSolution.py

---
 rdd/airports/AirportsInUsaSolution.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rdd/airports/AirportsInUsaSolution.py b/rdd/airports/AirportsInUsaSolution.py
index 044a29a1..96ec733b 100644
--- a/rdd/airports/AirportsInUsaSolution.py
+++ b/rdd/airports/AirportsInUsaSolution.py
@@ -6,7 +6,7 @@ def splitComma(line: str):
     return "{}, {}".format(splits[1], splits[2])
 
 if __name__ == "__main__":
-    sc = SparkContext("local", "count")
+    sc = SparkContext("local", "airports")
 
     airports = sc.textFile("in/airports.text")
     airportsInUSA = airports.filter(lambda line : Utils.COMMA_DELIMITER.split(line)[3] == "\"United States\"")

From 131e3cf205be8f47d6ddcea335fa271f79d16691 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Thu, 28 Sep 2017 16:23:07 +0200
Subject: [PATCH 09/10] Finished rdd | Removed scala files from rdd

---
 rdd/airports/AirportsByLatitudeProblem.scala  | 20 --------------
 rdd/airports/AirportsByLatitudeSolution.scala | 23 ----------------
 rdd/airports/AirportsInUsaProblem.scala       | 19 --------------
 rdd/airports/AirportsInUsaSolution.scala      | 22 ----------------
 rdd/collect/CollectExample.scala              | 20 --------------
 rdd/count/CountExample.scala                  | 23 ----------------
 rdd/nasaApacheWebLogs/SameHostsProblem.scala  | 23 ----------------
 rdd/nasaApacheWebLogs/SameHostsSolution.scala | 23 ----------------
 rdd/nasaApacheWebLogs/UnionLogProblem.scala   | 18 ------------
 rdd/nasaApacheWebLogs/UnionLogsSolution.scala | 26 -------------------
 rdd/persist/PersistExample.scala              | 22 ----------------
 rdd/reduce/ReduceExample.scala                | 18 ------------
 12 files changed, 257 deletions(-)
 delete mode 100644 rdd/airports/AirportsByLatitudeProblem.scala
 delete mode 100644 rdd/airports/AirportsByLatitudeSolution.scala
 delete mode 100644 rdd/airports/AirportsInUsaProblem.scala
 delete mode 100644 rdd/airports/AirportsInUsaSolution.scala
 delete mode 100644 rdd/collect/CollectExample.scala
 delete mode 100644 rdd/count/CountExample.scala
 delete mode 100644 rdd/nasaApacheWebLogs/SameHostsProblem.scala
 delete mode 100644 rdd/nasaApacheWebLogs/SameHostsSolution.scala
 delete mode 100644 rdd/nasaApacheWebLogs/UnionLogProblem.scala
 delete mode 100644 rdd/nasaApacheWebLogs/UnionLogsSolution.scala
 delete mode 100644 rdd/persist/PersistExample.scala
 delete mode 100644 rdd/reduce/ReduceExample.scala

diff --git a/rdd/airports/AirportsByLatitudeProblem.scala b/rdd/airports/AirportsByLatitudeProblem.scala
deleted file mode 100644
index 8af9b1cd..00000000
--- a/rdd/airports/AirportsByLatitudeProblem.scala
+++ /dev/null
@@ -1,20 +0,0 @@
-package com.sparkTutorial.rdd.airports
-
-object AirportsByLatitudeProblem {
-
-  def main(args: Array[String]) {
-
-    /* Create a Spark program to read the airport data from in/airports.text, find all the airports whose latitude are bigger than 40.
-       Then output the airport's name and the airport's latitude to out/airports_by_latitude.text.
-
-       Each row of the input file contains the following columns:
-       Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
-       ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
-
-       Sample output:
-       "St Anthony", 51.391944
-       "Tofino", 49.082222
-       ...
-     */
-  }
-}
diff --git a/rdd/airports/AirportsByLatitudeSolution.scala b/rdd/airports/AirportsByLatitudeSolution.scala
deleted file mode 100644
index 13faf3c9..00000000
--- a/rdd/airports/AirportsByLatitudeSolution.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package com.sparkTutorial.rdd.airports
-
-import com.sparkTutorial.commons.Utils
-import org.apache.spark.{SparkConf, SparkContext}
-
-object AirportsByLatitudeSolution {
-
-  def main(args: Array[String]) {
-
-    val conf = new SparkConf().setAppName("airports").setMaster("local[2]")
-    val sc = new SparkContext(conf)
-
-    val airports = sc.textFile("in/airports.text")
-    val airportsInUSA = airports.filter(line => line.split(Utils.COMMA_DELIMITER)(6).toFloat > 40)
-
-    val airportsNameAndCityNames = airportsInUSA.map(line => {
-      val splits = line.split(Utils.COMMA_DELIMITER)
-      splits(1) + ", " + splits(6)
-    })
-
-    airportsNameAndCityNames.saveAsTextFile("out/airports_by_latitude.text")
-  }
-}
diff --git a/rdd/airports/AirportsInUsaProblem.scala b/rdd/airports/AirportsInUsaProblem.scala
deleted file mode 100644
index 89dd6f5f..00000000
--- a/rdd/airports/AirportsInUsaProblem.scala
+++ /dev/null
@@ -1,19 +0,0 @@
-package com.sparkTutorial.rdd.airports
-
-object AirportsInUsaProblem {
-  def main(args: Array[String]) {
-
-    /* Create a Spark program to read the airport data from in/airports.text, find all the airports which are located in United States
-       and output the airport's name and the city's name to out/airports_in_usa.text.
-
-       Each row of the input file contains the following columns:
-       Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
-       ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
-
-       Sample output:
-       "Putnam County Airport", "Greencastle"
-       "Dowagiac Municipal Airport", "Dowagiac"
-       ...
-     */
-  }
-}
diff --git a/rdd/airports/AirportsInUsaSolution.scala b/rdd/airports/AirportsInUsaSolution.scala
deleted file mode 100644
index 04621547..00000000
--- a/rdd/airports/AirportsInUsaSolution.scala
+++ /dev/null
@@ -1,22 +0,0 @@
-package com.sparkTutorial.rdd.airports
-
-import com.sparkTutorial.commons.Utils
-import org.apache.spark.{SparkConf, SparkContext}
-
-object AirportsInUsaSolution {
-
-  def main(args: Array[String]) {
-
-    val conf = new SparkConf().setAppName("airports").setMaster("local[2]")
-    val sc = new SparkContext(conf)
-
-    val airports = sc.textFile("in/airports.text")
-    val airportsInUSA = airports.filter(line => line.split(Utils.COMMA_DELIMITER)(3) == "\"United States\"")
-
-    val airportsNameAndCityNames = airportsInUSA.map(line => {
-      val splits = line.split(Utils.COMMA_DELIMITER)
-      splits(1) + ", " + splits(2)
-    })
-    airportsNameAndCityNames.saveAsTextFile("out/airports_in_usa.text")
-  }
-}
diff --git a/rdd/collect/CollectExample.scala b/rdd/collect/CollectExample.scala
deleted file mode 100644
index 1a450778..00000000
--- a/rdd/collect/CollectExample.scala
+++ /dev/null
@@ -1,20 +0,0 @@
-package com.sparkTutorial.rdd.collect
-
-import org.apache.log4j.Level
-import org.apache.log4j.Logger
-import org.apache.spark.{SparkConf, SparkContext}
-
-object CollectExample {
-  def main(args: Array[String]) {
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("collect").setMaster("local[*]")
-    val sc = new SparkContext(conf)
-
-    val inputWords = List("spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop")
-    val wordRdd = sc.parallelize(inputWords)
-
-    val words = wordRdd.collect()
-
-    for (word <- words) println(word)
-  }
-}
diff --git a/rdd/count/CountExample.scala b/rdd/count/CountExample.scala
deleted file mode 100644
index 1993183b..00000000
--- a/rdd/count/CountExample.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package com.sparkTutorial.rdd.count
-
-import org.apache.log4j.Level
-import org.apache.log4j.Logger
-import org.apache.spark.{SparkConf, SparkContext}
-
-object CountExample {
-
-  def main(args: Array[String]) {
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("count").setMaster("local[*]")
-    val sc = new SparkContext(conf)
-
-    val inputWords = List("spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop")
-    val wordRdd = sc.parallelize(inputWords)
-    println("Count: " + wordRdd.count())
-
-    val wordCountByValue = wordRdd.countByValue()
-    println("CountByValue:")
-
-    for ((word, count) <- wordCountByValue) println(word + " : " + count)
-  }
-}
diff --git a/rdd/nasaApacheWebLogs/SameHostsProblem.scala b/rdd/nasaApacheWebLogs/SameHostsProblem.scala
deleted file mode 100644
index d55a8b82..00000000
--- a/rdd/nasaApacheWebLogs/SameHostsProblem.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package com.sparkTutorial.rdd.nasaApacheWebLogs
-
-object SameHostsProblem {
-
-  def main(args: Array[String]) {
-
-    /* "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's apache server for July 1st, 1995.
-       "in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995
-       Create a Spark program to generate a new RDD which contains the hosts which are accessed on BOTH days.
-       Save the resulting RDD to "out/nasa_logs_same_hosts.csv" file.
-
-       Example output:
-       vagrant.vf.mmc.com
-       www-a1.proxy.aol.com
-       .....
-
-       Keep in mind, that the original log files contains the following header lines.
-       host logname time method url response bytes
-
-       Make sure the head lines are removed in the resulting RDD.
-     */
-  }
-}
diff --git a/rdd/nasaApacheWebLogs/SameHostsSolution.scala b/rdd/nasaApacheWebLogs/SameHostsSolution.scala
deleted file mode 100644
index 168f9454..00000000
--- a/rdd/nasaApacheWebLogs/SameHostsSolution.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package com.sparkTutorial.rdd.nasaApacheWebLogs
-
-import org.apache.spark.{SparkConf, SparkContext}
-
-object SameHostsSolution {
-
-  def main(args: Array[String]) {
-
-    val conf = new SparkConf().setAppName("sameHosts").setMaster("local[1]")
-    val sc = new SparkContext(conf)
-
-    val julyFirstLogs = sc.textFile("in/nasa_19950701.tsv")
-    val augustFirstLogs = sc.textFile("in/nasa_19950801.tsv")
-
-    val julyFirstHosts = julyFirstLogs.map(line => line.split("\t")(0))
-    val augustFirstHosts = augustFirstLogs.map(line => line.split("\t")(0))
-
-    val intersection = julyFirstHosts.intersection(augustFirstHosts)
-
-    val cleanedHostIntersection = intersection.filter(host => host != "host")
-    cleanedHostIntersection.saveAsTextFile("out/nasa_logs_same_hosts.csv")
-  }
-}
diff --git a/rdd/nasaApacheWebLogs/UnionLogProblem.scala b/rdd/nasaApacheWebLogs/UnionLogProblem.scala
deleted file mode 100644
index d56dc986..00000000
--- a/rdd/nasaApacheWebLogs/UnionLogProblem.scala
+++ /dev/null
@@ -1,18 +0,0 @@
-package com.sparkTutorial.rdd.nasaApacheWebLogs
-
-object UnionLogProblem {
-
-  def main(args: Array[String]) {
-
-    /* "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's apache server for July 1st, 1995.
-       "in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995
-       Create a Spark program to generate a new RDD which contains the log lines from both July 1st and August 1st,
-       take a 0.1 sample of those log lines and save it to "out/sample_nasa_logs.tsv" file.
-
-       Keep in mind, that the original log files contains the following header lines.
-       host logname time method url response bytes
-
-       Make sure the head lines are removed in the resulting RDD.
-     */
-  }
-}
diff --git a/rdd/nasaApacheWebLogs/UnionLogsSolution.scala b/rdd/nasaApacheWebLogs/UnionLogsSolution.scala
deleted file mode 100644
index f6db5473..00000000
--- a/rdd/nasaApacheWebLogs/UnionLogsSolution.scala
+++ /dev/null
@@ -1,26 +0,0 @@
-package com.sparkTutorial.rdd.nasaApacheWebLogs
-
-import org.apache.spark.{SparkConf, SparkContext}
-
-object UnionLogsSolution {
-
-  def main(args: Array[String]) {
-
-    val conf = new SparkConf().setAppName("unionLogs").setMaster("local[*]")
-
-    val sc = new SparkContext(conf)
-
-    val julyFirstLogs = sc.textFile("in/nasa_19950701.tsv")
-    val augustFirstLogs = sc.textFile("in/nasa_19950801.tsv")
-
-    val aggregatedLogLines = julyFirstLogs.union(augustFirstLogs)
-
-    val cleanLogLines = aggregatedLogLines.filter(line => isNotHeader(line))
-
-    val sample = cleanLogLines.sample(withReplacement = true, fraction = 0.1)
-
-    sample.saveAsTextFile("out/sample_nasa_logs.csv")
-  }
-
-  def isNotHeader(line: String): Boolean = !(line.startsWith("host") && line.contains("bytes"))
-}
diff --git a/rdd/persist/PersistExample.scala b/rdd/persist/PersistExample.scala
deleted file mode 100644
index 48ff5821..00000000
--- a/rdd/persist/PersistExample.scala
+++ /dev/null
@@ -1,22 +0,0 @@
-package com.sparkTutorial.rdd.persist
-
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.storage.StorageLevel
-
-object PersistExample {
-
-  def main(args: Array[String]) {
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("reduce").setMaster("local[*]")
-    val sc = new SparkContext(conf)
-
-    val inputIntegers = List(1, 2, 3, 4, 5)
-    val integerRdd = sc.parallelize(inputIntegers)
-
-    integerRdd.persist(StorageLevel.MEMORY_ONLY)
-
-    integerRdd.reduce((x, y) => x * y)
-    integerRdd.count()
-  }
-}
diff --git a/rdd/reduce/ReduceExample.scala b/rdd/reduce/ReduceExample.scala
deleted file mode 100644
index 53b863b9..00000000
--- a/rdd/reduce/ReduceExample.scala
+++ /dev/null
@@ -1,18 +0,0 @@
-package com.sparkTutorial.rdd.reduce
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object ReduceExample {
-
-  def main(args: Array[String]) {
-    Logger.getLogger("org").setLevel(Level.OFF)
-    val conf = new SparkConf().setAppName("reduce").setMaster("local[*]")
-    val sc = new SparkContext(conf)
-
-    val inputIntegers = List(1, 2, 3, 4, 5)
-    val integerRdd = sc.parallelize(inputIntegers)
-
-    val product = integerRdd.reduce((x, y) => x * y)
-    println("product is :" + product)
-  }
-}

From f637b18a6405ead4fc89fb6c6967747710d2ba0f Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Fri, 29 Sep 2017 08:23:40 +0200
Subject: [PATCH 10/10] Setting log level to ERROR in scripts that prints to
 the standard output

---
 rdd/WordCount.py                           | 13 +++++++------
 rdd/collect/CollectExample.py              |  1 +
 rdd/count/CountExample.py                  |  3 ++-
 rdd/nasaApacheWebLogs/UnionLogSolutions.py |  2 +-
 rdd/reduce/ReduceExample.py                |  1 +
 rdd/sumOfNumbers/SumOfNumbersProblem.py    |  5 ++---
 rdd/sumOfNumbers/SumOfNumbersSolution.py   |  3 ++-
 rdd/take/TakeExample.py                    | 13 +++++++------
 8 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/rdd/WordCount.py b/rdd/WordCount.py
index ac6e5e58..c95a04f1 100644
--- a/rdd/WordCount.py
+++ b/rdd/WordCount.py
@@ -2,9 +2,10 @@
 from pyspark import SparkContext
 
 if __name__ == "__main__":
-    sc = SparkContext("local", "word count")
-    lines = sc.textFile("in/word_count.text")
-    words = lines.flatMap(lambda line: line.split(" "))
-    wordCounts = words.countByValue()
-    for word, count in wordCounts.items():
-        print(word, count)
\ No newline at end of file
+    sc = SparkContext("local", "word count")
+    sc.setLogLevel("ERROR")
+    lines = sc.textFile("in/word_count.text")
+    words = lines.flatMap(lambda line: line.split(" "))
+    wordCounts = words.countByValue()
+    for word, count in wordCounts.items():
+        print(word, count)
diff --git a/rdd/collect/CollectExample.py b/rdd/collect/CollectExample.py
index 431aadfd..fe01119d 100644
--- a/rdd/collect/CollectExample.py
+++ b/rdd/collect/CollectExample.py
@@ -2,6 +2,7 @@
 
 if __name__ == "__main__":
     sc = SparkContext("local", "collect")
+    sc.setLogLevel("ERROR")
     inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
     wordRdd = sc.parallelize(inputWords)
     words = wordRdd.collect()
diff --git a/rdd/count/CountExample.py b/rdd/count/CountExample.py
index a9f35316..93bbf45e 100644
--- a/rdd/count/CountExample.py
+++ b/rdd/count/CountExample.py
@@ -2,10 +2,11 @@
 
 if __name__ == "__main__":
     sc = SparkContext("local", "count")
+    sc.setLogLevel("ERROR")
     inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
     wordRdd = sc.parallelize(inputWords)
     print("Count: {}".format(wordRdd.count()))
     worldCountByValue = wordRdd.countByValue()
     print("CountByValue: ")
     for word, count in worldCountByValue.items():
-        print("{} : {}".format(word, count))
\ No newline at end of file
+        print("{} : {}".format(word, count))
diff --git a/rdd/nasaApacheWebLogs/UnionLogSolutions.py b/rdd/nasaApacheWebLogs/UnionLogSolutions.py
index 067e2579..c69c0332 100644
--- a/rdd/nasaApacheWebLogs/UnionLogSolutions.py
+++ b/rdd/nasaApacheWebLogs/UnionLogSolutions.py
@@ -1,6 +1,6 @@
 from pyspark import SparkContext
 
-def isNotHeader(line:str):
+def isNotHeader(line: str):
     return not (line.startswith("host") and "bytes" in line)
 
 if __name__ == "__main__":
     sc = SparkContext("local", "unionLogs")
diff --git a/rdd/reduce/ReduceExample.py b/rdd/reduce/ReduceExample.py
index 717e97d3..7eb8392a 100644
--- a/rdd/reduce/ReduceExample.py
+++ b/rdd/reduce/ReduceExample.py
@@ -2,6 +2,7 @@
 
 if __name__ == "__main__":
     sc = SparkContext("local", "reduce")
+    sc.setLogLevel("ERROR")
     inputIntegers = [1, 2, 3, 4, 5]
     integerRdd = sc.parallelize(inputIntegers)
     product = integerRdd.reduce(lambda x, y: x * y)
diff --git a/rdd/sumOfNumbers/SumOfNumbersProblem.py b/rdd/sumOfNumbers/SumOfNumbersProblem.py
index 327b903e..72671962 100644
--- a/rdd/sumOfNumbers/SumOfNumbersProblem.py
+++ b/rdd/sumOfNumbers/SumOfNumbersProblem.py
@@ -1,11 +1,10 @@
-
 import sys
 from pyspark import SparkContext
 
 if __name__ == "__main__":
-    '''
+    '''
     Create a Spark program to read the first 100 prime numbers from in/prime_nums.text,
     print the sum of those numbers to console.
 
     Each row of the input file contains 10 prime numbers separated by spaces.
-    '''
\ No newline at end of file
+    '''
diff --git a/rdd/sumOfNumbers/SumOfNumbersSolution.py b/rdd/sumOfNumbers/SumOfNumbersSolution.py
index 18bbb9e8..b0315c51 100644
--- a/rdd/sumOfNumbers/SumOfNumbersSolution.py
+++ b/rdd/sumOfNumbers/SumOfNumbersSolution.py
@@ -3,9 +3,10 @@
 
 if __name__ == "__main__":
     sc = SparkContext("local", "primeNumbers")
+    sc.setLogLevel("ERROR")
     lines = sc.textFile("in/prime_nums.text")
     numbers = lines.flatMap(lambda line: line.split("\t"))
     validNumbers = numbers.filter(lambda number: number)
     intNumbers = validNumbers.map(lambda number: int(number))
     print("Sum is: ")
-    print(intNumbers.reduce(lambda x, y: x + y))
\ No newline at end of file
+    print(intNumbers.reduce(lambda x, y: x + y))
diff --git a/rdd/take/TakeExample.py b/rdd/take/TakeExample.py
index 6a91e063..fc73a781 100644
--- a/rdd/take/TakeExample.py
+++ b/rdd/take/TakeExample.py
@@ -2,9 +2,10 @@
 from pyspark import SparkContext
 
 if __name__ == "__main__":
-    sc = SparkContext("local", "take")
-    inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
-    wordRdd = sc.parallelize(inputWords)
-    words = wordRdd.take(3)
-    for word in words:
-        print(word)
\ No newline at end of file
+    sc = SparkContext("local", "take")
+    sc.setLogLevel("ERROR")
+    inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
+    wordRdd = sc.parallelize(inputWords)
+    words = wordRdd.take(3)
+    for word in words:
+        print(word)