From 4f0a0147ff6234b420a3a4d447a7c068fa428fb7 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 12:36:54 +0200
Subject: [PATCH 01/10] Added pairRdd/sort/*.py

---
 pairRdd/sort/AverageHousePriceSolution.py | 23 +++++++++++++++++++++
 pairRdd/sort/SortedWordCountProblem.py    | 16 ++++++++++++++++
 pairRdd/sort/SortedWordCountSolution.py   | 20 ++++++++++++++++++++
 3 files changed, 59 insertions(+)
 create mode 100644 pairRdd/sort/AverageHousePriceSolution.py
 create mode 100644 pairRdd/sort/SortedWordCountProblem.py
 create mode 100644 pairRdd/sort/SortedWordCountSolution.py

diff --git a/pairRdd/sort/AverageHousePriceSolution.py b/pairRdd/sort/AverageHousePriceSolution.py
new file mode 100644
index 00000000..4306fdbc
--- /dev/null
+++ b/pairRdd/sort/AverageHousePriceSolution.py
@@ -0,0 +1,23 @@
+from pairRdd.aggregation.reducebykey.housePrice.AvgCount import AvgCount
+from pyspark import SparkContext
+
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "averageHousePriceSolution")
+    sc.setLogLevel("ERROR")
+
+    lines = sc.textFile("in/RealEstate.csv")
+    cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)
+    housePricePairRdd = cleanedLines.map(lambda line: \
+        (int(float(line.split(",")[3])), AvgCount(1, float(line.split(",")[2]))))
+
+    housePriceTotal = housePricePairRdd.reduceByKey(lambda x, y: \
+        AvgCount(x.count + y.count, x.total + y.total))
+
+    housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount.total / avgCount.count)
+
+    sortedHousePriceAvg = housePriceAvg.sortByKey()
+
+    for bedrooms, avgPrice in sortedHousePriceAvg.collect():
+        print("{} : {}".format(bedrooms, avgPrice))
diff --git a/pairRdd/sort/SortedWordCountProblem.py b/pairRdd/sort/SortedWordCountProblem.py
new file mode 100644
index 00000000..bc7817b4
--- /dev/null
+++ b/pairRdd/sort/SortedWordCountProblem.py
@@ -0,0 +1,16 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    Create a Spark program to read an article from in/word_count.text,
+    then output the number of occurrences of each word in descending order.
+
+    Sample output:
+
+    apple : 200
+    shoes : 193
+    bag : 176
+    ...
+
+    '''
diff --git a/pairRdd/sort/SortedWordCountSolution.py b/pairRdd/sort/SortedWordCountSolution.py
new file mode 100644
index 00000000..398c57ae
--- /dev/null
+++ b/pairRdd/sort/SortedWordCountSolution.py
@@ -0,0 +1,20 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "wordCounts")
+    sc.setLogLevel("ERROR")
+    lines = sc.textFile("in/word_count.text")
+    wordRdd = lines.flatMap(lambda line: line.split(" "))
+
+    wordPairRdd = wordRdd.map(lambda word: (word, 1))
+    wordToCountPairs = wordPairRdd.reduceByKey(lambda x, y: x + y)
+
+    countToWordPairs = wordToCountPairs.map(lambda wordToCount: (wordToCount[1], wordToCount[0]))
+
+    sortedCountToWordPairs = countToWordPairs.sortByKey(ascending=False)
+
+    sortedWordToCountPairs = sortedCountToWordPairs.map(lambda countToWord: (countToWord[1], countToWord[0]))
+
+    for word, count in sortedWordToCountPairs.collect():
+        print("{} : {}".format(word, count))
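A note on SortedWordCountSolution.py above: it swaps (word, count) to (count, word) and back again purely so it can reuse sortByKey. PySpark RDDs also provide sortBy, which takes a key function and makes the double swap unnecessary. A minimal sketch of that variant, assuming the same in/word_count.text input (the app name is made up):

    from pyspark import SparkContext

    if __name__ == "__main__":
        sc = SparkContext("local", "wordCountsSortBy")
        sc.setLogLevel("ERROR")

        wordCounts = sc.textFile("in/word_count.text") \
            .flatMap(lambda line: line.split(" ")) \
            .map(lambda word: (word, 1)) \
            .reduceByKey(lambda x, y: x + y)

        # sortBy orders by an arbitrary key function, so the (word, count)
        # pairs can be sorted by count without swapping the tuples twice.
        for word, count in wordCounts.sortBy(lambda pair: pair[1], ascending=False).collect():
            print("{} : {}".format(word, count))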
From 56aa850dd0c94eab7052d278a2c183746dce5a16 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 12:45:11 +0200
Subject: [PATCH 02/10] Added pairRdd/mapValues/*.py

---
 pairRdd/mapValues/AirportsUppercaseProblem.py | 24 +++++++++++++++++++
 .../mapValues/AirportsUppercaseSolution.py    | 17 +++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 pairRdd/mapValues/AirportsUppercaseProblem.py
 create mode 100644 pairRdd/mapValues/AirportsUppercaseSolution.py

diff --git a/pairRdd/mapValues/AirportsUppercaseProblem.py b/pairRdd/mapValues/AirportsUppercaseProblem.py
new file mode 100644
index 00000000..124b37f0
--- /dev/null
+++ b/pairRdd/mapValues/AirportsUppercaseProblem.py
@@ -0,0 +1,24 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    Create a Spark program to read the airport data from in/airports.text, generate a pair RDD with airport name
+    being the key and country name being the value. Then convert the country name to uppercase and
+    output the pair RDD to out/airports_uppercase.text
+
+    Each row of the input file contains the following columns:
+
+    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
+    ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
+
+    Sample output:
+
+    ("Kamloops", "CANADA")
+    ("Wewak Intl", "PAPUA NEW GUINEA")
+    ...
+
+    '''
+
+
+
\ No newline at end of file
diff --git a/pairRdd/mapValues/AirportsUppercaseSolution.py b/pairRdd/mapValues/AirportsUppercaseSolution.py
new file mode 100644
index 00000000..3c9fa201
--- /dev/null
+++ b/pairRdd/mapValues/AirportsUppercaseSolution.py
@@ -0,0 +1,17 @@
+from pyspark import SparkContext
+from commons.Utils import Utils
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "airports")
+    sc.setLogLevel("ERROR")
+
+    airportsRDD = sc.textFile("in/airports.text")
+
+    airportPairRDD = airportsRDD.map(lambda line: \
+        (Utils.COMMA_DELIMITER.split(line)[1], \
+         Utils.COMMA_DELIMITER.split(line)[3]))
+
+    upperCase = airportPairRDD.mapValues(lambda countryName: countryName.upper())
+
+    upperCase.saveAsTextFile("out/airports_uppercase.text")
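The solution above (and several patches below) splits rows with Utils.COMMA_DELIMITER, but commons/Utils.py is not part of this series. As an assumption for readers following along, it is a precompiled regex along these lines, splitting only on commas that sit outside double quotes:

    import re

    class Utils():
        # Assumed definition (not shown in this patch series): a comma is a
        # delimiter only if an even number of quote characters follows it,
        # so a quoted field like "Montreal, QC" survives as one column.
        COMMA_DELIMITER = re.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''')

    # Quoted fields keep their surrounding quotes after the split, which is
    # why patch 05 below compares the country against "\"United States\""
    # literally. A representative airports.text row for illustration:
    row = '1,"Goroka","Goroka","Papua New Guinea","GKA","AYGA",-6.08,145.39,5282,10,"U","Pacific/Port_Moresby"'
    print(Utils.COMMA_DELIMITER.split(row)[3])   # prints "Papua New Guinea" (with quotes)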
From 244fe9b11d5ba3ddb8d0d72d90501bef5093e9ce Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 12:49:54 +0200
Subject: [PATCH 03/10] Added pairRdd/join/JoinOperations.py

---
 pairRdd/join/JoinOperations.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 pairRdd/join/JoinOperations.py

diff --git a/pairRdd/join/JoinOperations.py b/pairRdd/join/JoinOperations.py
new file mode 100644
index 00000000..250f9007
--- /dev/null
+++ b/pairRdd/join/JoinOperations.py
@@ -0,0 +1,21 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "JoinOperations")
+    sc.setLogLevel("ERROR")
+
+    ages = sc.parallelize([("Tom", 29), ("John", 22)])
+    addresses = sc.parallelize([("James", "USA"), ("John", "UK")])
+
+    join = ages.join(addresses)
+    join.saveAsTextFile("out/age_address_join.text")
+
+    leftOuterJoin = ages.leftOuterJoin(addresses)
+    leftOuterJoin.saveAsTextFile("out/age_address_left_out_join.text")
+
+    rightOuterJoin = ages.rightOuterJoin(addresses)
+    rightOuterJoin.saveAsTextFile("out/age_address_right_out_join.text")
+
+    fullOuterJoin = ages.fullOuterJoin(addresses)
+    fullOuterJoin.saveAsTextFile("out/age_address_full_out_join.text")
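For the two small RDDs in JoinOperations.py, the four joins produce the pairs below; PySpark fills the missing side of an outer join with None. A quick way to check, collecting instead of saving to text files (element order may vary):

    print(ages.join(addresses).collect())
    # [('John', (22, 'UK'))]                            keys present in both RDDs
    print(ages.leftOuterJoin(addresses).collect())
    # [('Tom', (29, None)), ('John', (22, 'UK'))]       every key from ages
    print(ages.rightOuterJoin(addresses).collect())
    # [('James', (None, 'USA')), ('John', (22, 'UK'))]  every key from addresses
    print(ages.fullOuterJoin(addresses).collect())
    # [('Tom', (29, None)), ('James', (None, 'USA')), ('John', (22, 'UK'))]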
From d6b58da2436d979a76bc0ed9ffa9027f65a2938f Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 12:57:12 +0200
Subject: [PATCH 04/10] Added pairRdd/groupbykey/*.py

---
 .../groupbykey/AirportsByCountryProblem.py    | 23 +++++++++++++++++++
 .../groupbykey/AirportsByCountrySolution.py   | 18 +++++++++++++++
 pairRdd/groupbykey/GroupByKeyVsReduceByKey.py | 18 +++++++++++++++
 3 files changed, 59 insertions(+)
 create mode 100644 pairRdd/groupbykey/AirportsByCountryProblem.py
 create mode 100644 pairRdd/groupbykey/AirportsByCountrySolution.py
 create mode 100644 pairRdd/groupbykey/GroupByKeyVsReduceByKey.py

diff --git a/pairRdd/groupbykey/AirportsByCountryProblem.py b/pairRdd/groupbykey/AirportsByCountryProblem.py
new file mode 100644
index 00000000..6e4b1e24
--- /dev/null
+++ b/pairRdd/groupbykey/AirportsByCountryProblem.py
@@ -0,0 +1,23 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    Create a Spark program to read the airport data from in/airports.text,
+    output the list of the names of the airports located in each country.
+
+    Each row of the input file contains the following columns:
+    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
+    ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
+
+    Sample output:
+
+    "Canada", ["Bagotville", "Montreal", "Coronation", ...]
+    "Norway" : ["Vigra", "Andenes", "Alta", "Bomoen", "Bronnoy",..]
+    "Papua New Guinea", ["Goroka", "Madang", ...]
+    ...
+
+    '''
+
+
+
\ No newline at end of file
diff --git a/pairRdd/groupbykey/AirportsByCountrySolution.py b/pairRdd/groupbykey/AirportsByCountrySolution.py
new file mode 100644
index 00000000..0cc7017e
--- /dev/null
+++ b/pairRdd/groupbykey/AirportsByCountrySolution.py
@@ -0,0 +1,18 @@
+from pyspark import SparkContext
+from commons.Utils import Utils
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "airports")
+    sc.setLogLevel("ERROR")
+
+    lines = sc.textFile("in/airports.text")
+
+    countryAndAirportNameAndPair = lines.map(lambda airport: \
+        (Utils.COMMA_DELIMITER.split(airport)[3],
+         Utils.COMMA_DELIMITER.split(airport)[1]))
+
+    airportsByCountry = countryAndAirportNameAndPair.groupByKey()
+
+    for country, airportName in airportsByCountry.collectAsMap().items():
+        print("{}: {}".format(country, list(airportName)))
diff --git a/pairRdd/groupbykey/GroupByKeyVsReduceByKey.py b/pairRdd/groupbykey/GroupByKeyVsReduceByKey.py
new file mode 100644
index 00000000..99eb96df
--- /dev/null
+++ b/pairRdd/groupbykey/GroupByKeyVsReduceByKey.py
@@ -0,0 +1,18 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "GroupByKeyVsReduceByKey")
+    sc.setLogLevel("ERROR")
+
+    words = ["one", "two", "two", "three", "three", "three"]
+    wordsPairRdd = sc.parallelize(words).map(lambda word: (word, 1))
+
+    wordCountsWithReduceByKey = wordsPairRdd.reduceByKey(lambda x, y: x + y).collect()
+    print("wordCountsWithReduceByKey: {}".format(list(wordCountsWithReduceByKey)))
+
+    wordCountsWithGroupByKey = wordsPairRdd \
+        .groupByKey() \
+        .mapValues(lambda intIterable: len(intIterable)) \
+        .collect()
+    print("wordCountsWithGroupByKey: {}".format(list(wordCountsWithGroupByKey)))
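GroupByKeyVsReduceByKey.py prints identical counts either way, but the two move different amounts of data: reduceByKey pre-aggregates on each partition before the shuffle, while groupByKey ships every raw (word, 1) pair across the network and materializes the whole iterable for each key. When the combine logic outgrows a single lambda, aggregateByKey keeps the map-side combine; a sketch reusing wordsPairRdd from the file above:

    # Same count via aggregateByKey: a zero value, a function folding one
    # record into a partition-local accumulator, and a function merging
    # accumulators from different partitions.
    wordCountsWithAggregateByKey = wordsPairRdd \
        .aggregateByKey(0, lambda acc, one: acc + one, lambda a, b: a + b) \
        .collect()
    print("wordCountsWithAggregateByKey: {}".format(wordCountsWithAggregateByKey))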
From a2068e706d028a70d43bd52f76a464c0a6644e2b Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 13:00:32 +0200
Subject: [PATCH 05/10] Added pairRdd/filter/*.py

---
 pairRdd/filter/AirportsNotInUsaProblem.py  | 20 ++++++++++++++++++++
 pairRdd/filter/AirportsNotInUsaSolution.py | 16 ++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 pairRdd/filter/AirportsNotInUsaProblem.py
 create mode 100644 pairRdd/filter/AirportsNotInUsaSolution.py

diff --git a/pairRdd/filter/AirportsNotInUsaProblem.py b/pairRdd/filter/AirportsNotInUsaProblem.py
new file mode 100644
index 00000000..d0e2da3a
--- /dev/null
+++ b/pairRdd/filter/AirportsNotInUsaProblem.py
@@ -0,0 +1,20 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    Create a Spark program to read the airport data from in/airports.text;
+    generate a pair RDD with airport name being the key and country name being the value.
+    Then remove all the airports which are located in the United States and output the pair RDD to out/airports_not_in_usa_pair_rdd.text
+
+    Each row of the input file contains the following columns:
+    Airport ID, Name of airport, Main city served by airport, Country where airport is located,
+    IATA/FAA code, ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
+
+    Sample output:
+
+    ("Kamloops", "Canada")
+    ("Wewak Intl", "Papua New Guinea")
+    ...
+
+    '''
diff --git a/pairRdd/filter/AirportsNotInUsaSolution.py b/pairRdd/filter/AirportsNotInUsaSolution.py
new file mode 100644
index 00000000..fd9c2fe9
--- /dev/null
+++ b/pairRdd/filter/AirportsNotInUsaSolution.py
@@ -0,0 +1,16 @@
+from pyspark import SparkContext
+from commons.Utils import Utils
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "airports")
+    sc.setLogLevel("ERROR")
+
+    airportsRDD = sc.textFile("in/airports.text")
+
+    airportPairRDD = airportsRDD.map(lambda line: \
+        (Utils.COMMA_DELIMITER.split(line)[1],
+         Utils.COMMA_DELIMITER.split(line)[3]))
+    airportsNotInUSA = airportPairRDD.filter(lambda keyValue: keyValue[1] != "\"United States\"")
+
+    airportsNotInUSA.saveAsTextFile("out/airports_not_in_usa_pair_rdd.text")

From c6a3a82cdc24f6908cc6de7cad2cb8600216a085 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 13:03:24 +0200
Subject: [PATCH 06/10] Added pairRdd/create/*.py

---
 pairRdd/create/PairRddFromRegularRdd.py | 12 ++++++++++++
 pairRdd/create/PairRddFromTupleList.py  | 11 +++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 pairRdd/create/PairRddFromRegularRdd.py
 create mode 100644 pairRdd/create/PairRddFromTupleList.py

diff --git a/pairRdd/create/PairRddFromRegularRdd.py b/pairRdd/create/PairRddFromRegularRdd.py
new file mode 100644
index 00000000..bfd6f187
--- /dev/null
+++ b/pairRdd/create/PairRddFromRegularRdd.py
@@ -0,0 +1,12 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "create")
+    sc.setLogLevel("ERROR")
+
+    inputStrings = ["Lily 23", "Jack 29", "Mary 29", "James 8"]
+    regularRDDs = sc.parallelize(inputStrings)
+
+    pairRDD = regularRDDs.map(lambda s: (s.split(" ")[0], s.split(" ")[1]))
+    pairRDD.coalesce(1).saveAsTextFile("out/pair_rdd_from_regular_rdd")
diff --git a/pairRdd/create/PairRddFromTupleList.py b/pairRdd/create/PairRddFromTupleList.py
new file mode 100644
index 00000000..c728d005
--- /dev/null
+++ b/pairRdd/create/PairRddFromTupleList.py
@@ -0,0 +1,11 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "create")
+    sc.setLogLevel("ERROR")
+
+    tuples = [("Lily", 23), ("Jack", 29), ("Mary", 29), ("James", 8)]
+    pairRDD = sc.parallelize(tuples)
+
+    pairRDD.coalesce(1).saveAsTextFile("out/pair_rdd_from_tuple_list")

From 7bcccc9aa2fc88d2742795120ccd2eb5935f421f Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 13:13:59 +0200
Subject: [PATCH 07/10] Added
 pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py

---
 .../combinebykey/AverageHousePriceSolution.py | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py

diff --git a/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py b/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py
new file mode 100644
index 00000000..4885ccbe
--- /dev/null
+++ b/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py
@@ -0,0 +1,21 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "AverageHousePrice")
+    sc.setLogLevel("ERROR")
+
+    lines = sc.textFile("in/RealEstate.csv")
+    cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)
+
+    housePricePairRdd = cleanedLines.map(lambda line: (line.split(",")[3], float(line.split(",")[2])))
+
+    createCombiner = lambda x: (1, x)
+    mergeValue = lambda avgCount, x: (avgCount[0] + 1, avgCount[1] + x)
+    mergeCombiners = lambda avgCountA, avgCountB: (avgCountA[0] + avgCountB[0], avgCountA[1] + avgCountB[1])
+
+    housePriceTotal = housePricePairRdd.combineByKey(createCombiner, mergeValue, mergeCombiners)
+
+    housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount[1] / avgCount[0])
+    for bedrooms, avgPrice in housePriceAvg.collect():
+        print("{} : {}".format(bedrooms, avgPrice))
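The three callbacks handed to combineByKey above are easiest to follow on a tiny trace. The numbers below are made up for illustration: a single key (the "3"-bedroom bucket) with its prices spread over two partitions:

    # Partition 1 holds prices [300000.0, 350000.0] for key "3";
    # partition 2 holds [250000.0].
    #
    # createCombiner(300000.0)                      -> (1, 300000.0)   first value in partition 1
    # mergeValue((1, 300000.0), 350000.0)           -> (2, 650000.0)   same partition
    # createCombiner(250000.0)                      -> (1, 250000.0)   first value in partition 2
    # mergeCombiners((2, 650000.0), (1, 250000.0))  -> (3, 900000.0)   across partitions
    # mapValues: 900000.0 / 3                       -> 300000.0        final average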
From 43f7883c03722e16c5b3fa0809f574240613862c Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 13:16:18 +0200
Subject: [PATCH 08/10] Added pairRdd/aggregation/reducebykey/WordCount.py

---
 pairRdd/aggregation/reducebykey/WordCount.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 pairRdd/aggregation/reducebykey/WordCount.py

diff --git a/pairRdd/aggregation/reducebykey/WordCount.py b/pairRdd/aggregation/reducebykey/WordCount.py
new file mode 100644
index 00000000..3a00f380
--- /dev/null
+++ b/pairRdd/aggregation/reducebykey/WordCount.py
@@ -0,0 +1,14 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "wordCounts")
+    sc.setLogLevel("ERROR")
+
+    lines = sc.textFile("in/word_count.text")
+    wordRdd = lines.flatMap(lambda line: line.split(" "))
+    wordPairRdd = wordRdd.map(lambda word: (word, 1))
+
+    wordCounts = wordPairRdd.reduceByKey(lambda x, y: x + y)
+    for word, count in wordCounts.collect():
+        print("{} : {}".format(word, count))

From fb3dfcbf7b00b5e7026c9b8aa8fd997d92d79d58 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 13:19:35 +0200
Subject: [PATCH 09/10] Added pairRdd/aggregation/reducebykey/housePrice/*.py

---
 .../housePrice/AverageHousePriceProblem.py  | 35 +++++++++++++++++++
 .../housePrice/AverageHousePriceSolution.py | 24 +++++++++++++
 .../reducebykey/housePrice/AvgCount.py      |  7 ++++
 3 files changed, 66 insertions(+)
 create mode 100644 pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.py
 create mode 100644 pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.py
 create mode 100644 pairRdd/aggregation/reducebykey/housePrice/AvgCount.py

diff --git a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.py b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.py
new file mode 100644
index 00000000..caf05e89
--- /dev/null
+++ b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.py
@@ -0,0 +1,35 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    Create a Spark program to read the house data from in/RealEstate.csv,
+    output the average price for houses with different numbers of bedrooms.
+
+    The houses dataset contains a collection of recent real estate listings in San Luis Obispo county and
+    around it.
+
+    The dataset contains the following fields:
+    1. MLS: Multiple listing service number for the house (unique ID).
+    2. Location: city/town where the house is located. Most locations are in San Luis Obispo county and
+    northern Santa Barbara county (Santa Maria-Orcutt, Lompoc, Guadelupe, Los Alamos), but there are
+    some out-of-area locations as well.
+    3. Price: the most recent listing price of the house (in dollars).
+    4. Bedrooms: number of bedrooms.
+    5. Bathrooms: number of bathrooms.
+    6. Size: size of the house in square feet.
+    7. Price/SQ.ft: price of the house per square foot.
+    8. Status: type of sale. Three types are represented in the dataset: Short Sale, Foreclosure and Regular.
+
+    Each field is comma separated.
+
+    Sample output:
+
+    (3, 325000)
+    (1, 266356)
+    (2, 325000)
+    ...
+
+    3, 1 and 2 mean the number of bedrooms. 325000 means the average price of houses with 3 bedrooms is 325000.
+
+    '''
diff --git a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.py b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.py
new file mode 100644
index 00000000..acb633e2
--- /dev/null
+++ b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.py
@@ -0,0 +1,24 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "avgHousePrice")
+    sc.setLogLevel("ERROR")
+
+    lines = sc.textFile("in/RealEstate.csv")
+    cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)
+
+    housePricePairRdd = cleanedLines.map(lambda line: \
+        (line.split(",")[3], (1, float(line.split(",")[2]))))
+
+    housePriceTotal = housePricePairRdd \
+        .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
+
+    print("housePriceTotal: ")
+    for bedroom, total in housePriceTotal.collect():
+        print("{} : {}".format(bedroom, total))
+
+    housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount[1] / avgCount[0])
+    print("\nhousePriceAvg: ")
+    for bedroom, avg in housePriceAvg.collect():
+        print("{} : {}".format(bedroom, avg))
diff --git a/pairRdd/aggregation/reducebykey/housePrice/AvgCount.py b/pairRdd/aggregation/reducebykey/housePrice/AvgCount.py
new file mode 100644
index 00000000..37a59ede
--- /dev/null
+++ b/pairRdd/aggregation/reducebykey/housePrice/AvgCount.py
@@ -0,0 +1,7 @@
+class AvgCount():
+
+    def __init__(self, count: int, total: float):
+        self.count = count
+        self.total = total
+
+
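AvgCount here is the same class the sort solution in patch 01 imports. Bundling count and total into one value lets reduceByKey carry both running sums in a single pass, leaving the average to one mapValues call. A standalone sketch of the arithmetic, with made-up prices:

    from pairRdd.aggregation.reducebykey.housePrice.AvgCount import AvgCount

    a = AvgCount(2, 650000.0)   # two listings totalling 650000
    b = AvgCount(1, 250000.0)   # one more listing
    merged = AvgCount(a.count + b.count, a.total + b.total)
    print(merged.total / merged.count)   # 300000.0, the running average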
From ee71addec096eea8c6f0c81c744bb596b71e811d Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 13:20:38 +0200
Subject: [PATCH 10/10] Finished pairRdd folder | Removed scala files from
 folder pairRdd

---
 .../combinebykey/AverageHousePriceSolution.scala | 30 ---------
 .../aggregation/reducebykey/WordCount.scala      | 20 ----------
 .../housePrice/AverageHousePriceProblem.scala    | 38 -------------------
 .../AverageHousePriceSolution.scala              | 27 -------------
 .../reducebykey/housePrice/AvgCount.scala        |  4 --
 pairRdd/create/PairRddFromRegularRdd.scala       | 18 ---------
 pairRdd/create/PairRddFromTupleList.scala        | 17 ---------
 pairRdd/filter/AirportsNotInUsaProblem.scala     | 22 -----------
 pairRdd/filter/AirportsNotInUsaSolution.scala    | 21 ----------
 .../groupbykey/AirportsByCountryProblem.scala    | 22 -----------
 .../groupbykey/AirportsByCountrySolution.scala   | 23 -----------
 .../groupbykey/GroupByKeyVsReduceByKey.scala     | 23 -----------
 pairRdd/join/JoinOperations.scala                | 27 -------------
 .../mapValues/AirportsUppercaseProblem.scala     | 23 -----------
 .../mapValues/AirportsUppercaseSolution.scala    | 22 -----------
 pairRdd/sort/AverageHousePriceSolution.scala     | 28 --------------
 pairRdd/sort/SortedWordCountProblem.scala        | 17 ---------
 pairRdd/sort/SortedWordCountSolution.scala       | 28 --------------
 18 files changed, 410 deletions(-)
 delete mode 100644 pairRdd/aggregation/combinebykey/AverageHousePriceSolution.scala
 delete mode 100644 pairRdd/aggregation/reducebykey/WordCount.scala
 delete mode 100644 pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.scala
 delete mode 100644 pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.scala
 delete mode 100644 pairRdd/aggregation/reducebykey/housePrice/AvgCount.scala
 delete mode 100644 pairRdd/create/PairRddFromRegularRdd.scala
 delete mode 100644 pairRdd/create/PairRddFromTupleList.scala
 delete mode 100644 pairRdd/filter/AirportsNotInUsaProblem.scala
 delete mode 100644 pairRdd/filter/AirportsNotInUsaSolution.scala
 delete mode 100644 pairRdd/groupbykey/AirportsByCountryProblem.scala
 delete mode 100644 pairRdd/groupbykey/AirportsByCountrySolution.scala
 delete mode 100644 pairRdd/groupbykey/GroupByKeyVsReduceByKey.scala
 delete mode 100644 pairRdd/join/JoinOperations.scala
 delete mode 100644 pairRdd/mapValues/AirportsUppercaseProblem.scala
 delete mode 100644 pairRdd/mapValues/AirportsUppercaseSolution.scala
 delete mode 100644 pairRdd/sort/AverageHousePriceSolution.scala
 delete mode 100644 pairRdd/sort/SortedWordCountProblem.scala
 delete mode 100644 pairRdd/sort/SortedWordCountSolution.scala

diff --git a/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.scala b/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.scala
deleted file mode 100644
index e744546f..00000000
--- a/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.scala
+++ /dev/null
@@ -1,30 +0,0 @@
-package com.sparkTutorial.pairRdd.aggregation.combinebykey
-
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object AverageHousePriceSolution {
-
-  def main(args: Array[String]) {
-
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("wordCounts").setMaster("local[3]")
-    val sc = new SparkContext(conf)
-
-    val lines = sc.textFile("in/RealEstate.csv")
-    val cleanedLines = lines.filter(line => !line.contains("Bedrooms"))
-
-    val housePricePairRdd = cleanedLines.map(line => (line.split(",")(3), line.split(",")(2).toDouble))
-
-    val createCombiner = (x: Double) => (1, x)
-    val mergeValue = (avgCount: AvgCount, x: Double) => (avgCount._1 + 1, avgCount._2 + x)
-    val mergeCombiners = (avgCountA: AvgCount, avgCountB: AvgCount) => (avgCountA._1 + avgCountB._1, avgCountA._2 + avgCountB._2)
-
-    val housePriceTotal = housePricePairRdd.combineByKey(createCombiner, mergeValue, mergeCombiners)
-
-    val housePriceAvg = housePriceTotal.mapValues(avgCount => avgCount._2 / avgCount._1)
-    for ((bedrooms, avgPrice) <- housePriceAvg.collect()) println(bedrooms + " : " + avgPrice)
-  }
-
-  type AvgCount = (Int, Double)
-}
diff --git a/pairRdd/aggregation/reducebykey/WordCount.scala b/pairRdd/aggregation/reducebykey/WordCount.scala
deleted file mode 100644
index 89d51803..00000000
--- a/pairRdd/aggregation/reducebykey/WordCount.scala
+++ /dev/null
@@ -1,20 +0,0 @@
-package com.sparkTutorial.pairRdd.aggregation.reducebykey
-
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object WordCount {
-
-  def main(args: Array[String]) {
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("wordCounts").setMaster("local[3]")
-    val sc = new SparkContext(conf)
-
-    val lines = sc.textFile("in/word_count.text")
-    val wordRdd = lines.flatMap(line => line.split(" "))
-    val wordPairRdd = wordRdd.map(word => (word, 1))
-
-    val wordCounts = wordPairRdd.reduceByKey((x, y) => x + y)
-    for ((word, count) <- wordCounts.collect()) println(word + " : " + count)
-  }
-}
diff --git a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.scala b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.scala
deleted file mode 100644
index 1d433a6b..00000000
--- a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.scala
+++ /dev/null
@@ -1,38 +0,0 @@
-package com.sparkTutorial.pairRdd.aggregation.reducebykey.housePrice
-
-object AverageHousePriceProblem {
-
-  def main(args: Array[String]) {
-
-    /* Create a Spark program to read the house data from in/RealEstate.csv,
-       output the average price for houses with different number of bedrooms.
-
-    The houses dataset contains a collection of recent real estate listings in San Luis Obispo county and
-    around it.
-
-    The dataset contains the following fields:
-    1. MLS: Multiple listing service number for the house (unique ID).
-    2. Location: city/town where the house is located. Most locations are in San Luis Obispo county and
-    northern Santa Barbara county (Santa Maria-Orcutt, Lompoc, Guadelupe, Los Alamos), but there
-    some out of area locations as well.
-    3. Price: the most recent listing price of the house (in dollars).
-    4. Bedrooms: number of bedrooms.
-    5. Bathrooms: number of bathrooms.
-    6. Size: size of the house in square feet.
-    7. Price/SQ.ft: price of the house per square foot.
-    8. Status: type of sale. Thee types are represented in the dataset: Short Sale, Foreclosure and Regular.
-
-    Each field is comma separated.
-
-    Sample output:
-
-    (3, 325000)
-    (1, 266356)
-    (2, 325000)
-    ...
-
-    3, 1 and 2 mean the number of bedrooms. 325000 means the average price of houses with 3 bedrooms is 325000.
-    */
-  }
-
-}
diff --git a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.scala b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.scala
deleted file mode 100644
index 29cf8b38..00000000
--- a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.scala
+++ /dev/null
@@ -1,27 +0,0 @@
-package com.sparkTutorial.pairRdd.aggregation.reducebykey.housePrice
-
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object AverageHousePriceSolution {
-
-  def main(args: Array[String]) {
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("avgHousePrice").setMaster("local[3]")
-    val sc = new SparkContext(conf)
-
-    val lines = sc.textFile("in/RealEstate.csv")
-    val cleanedLines = lines.filter(line => !line.contains("Bedrooms"))
-
-    val housePricePairRdd = cleanedLines.map(line => (line.split(",")(3), (1, line.split(",")(2).toDouble)))
-
-    val housePriceTotal = housePricePairRdd.reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2))
-
-    println("housePriceTotal: ")
-    for ((bedroom, total) <- housePriceTotal.collect()) println(bedroom + " : " + total)
-
-    val housePriceAvg = housePriceTotal.mapValues(avgCount => avgCount._2 / avgCount._1)
-    println("housePriceAvg: ")
-    for ((bedroom, avg) <- housePriceAvg.collect()) println(bedroom + " : " + avg)
-  }
-}
diff --git a/pairRdd/aggregation/reducebykey/housePrice/AvgCount.scala b/pairRdd/aggregation/reducebykey/housePrice/AvgCount.scala
deleted file mode 100644
index 8eb0ea9a..00000000
--- a/pairRdd/aggregation/reducebykey/housePrice/AvgCount.scala
+++ /dev/null
@@ -1,4 +0,0 @@
-package com.sparkTutorial.pairRdd.aggregation.reducebykey.housePrice
-
-case class AvgCount(count: Int, total: Double)
-
diff --git a/pairRdd/create/PairRddFromRegularRdd.scala b/pairRdd/create/PairRddFromRegularRdd.scala
deleted file mode 100644
index 5fa8ec05..00000000
--- a/pairRdd/create/PairRddFromRegularRdd.scala
+++ /dev/null
@@ -1,18 +0,0 @@
-package com.sparkTutorial.pairRdd.create
-
-import org.apache.spark.{SparkConf, SparkContext}
-
-object PairRddFromRegularRdd {
-
-  def main(args: Array[String]) {
-
-    val conf = new SparkConf().setAppName("create").setMaster("local[1]")
-    val sc = new SparkContext(conf)
-
-    val inputStrings = List("Lily 23", "Jack 29", "Mary 29", "James 8")
-    val regularRDDs = sc.parallelize(inputStrings)
-
-    val pairRDD = regularRDDs.map(s => (s.split(" ")(0), s.split(" ")(1)))
-    pairRDD.coalesce(1).saveAsTextFile("out/pair_rdd_from_regular_rdd")
-  }
-}
diff --git a/pairRdd/create/PairRddFromTupleList.scala b/pairRdd/create/PairRddFromTupleList.scala
deleted file mode 100644
index 3b9a7632..00000000
--- a/pairRdd/create/PairRddFromTupleList.scala
+++ /dev/null
@@ -1,17 +0,0 @@
-package com.sparkTutorial.pairRdd.create
-
-import org.apache.spark.{SparkConf, SparkContext}
-
-object PairRddFromTupleList {
-
-  def main(args: Array[String]) {
-
-    val conf = new SparkConf().setAppName("create").setMaster("local[1]")
-    val sc = new SparkContext(conf)
-
-    val tuple = List(("Lily", 23), ("Jack", 29), ("Mary", 29), ("James", 8))
-    val pairRDD = sc.parallelize(tuple)
-
-    pairRDD.coalesce(1).saveAsTextFile("out/pair_rdd_from_tuple_list")
-  }
-}
diff --git a/pairRdd/filter/AirportsNotInUsaProblem.scala b/pairRdd/filter/AirportsNotInUsaProblem.scala
deleted file mode 100644
index b1613b53..00000000
--- a/pairRdd/filter/AirportsNotInUsaProblem.scala
+++ /dev/null
@@ -1,22 +0,0 @@
-package com.sparkTutorial.pairRdd.filter
-
-object AirportsNotInUsaProblem {
-
-  def main(args: Array[String]) {
-
-    /* Create a Spark program to read the airport data from in/airports.text;
-       generate a pair RDD with airport name being the key and country name being the value.
-       Then remove all the airports which are located in United States and output the pair RDD to out/airports_not_in_usa_pair_rdd.text
-
-       Each row of the input file contains the following columns:
-       Airport ID, Name of airport, Main city served by airport, Country where airport is located,
-       IATA/FAA code, ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
-
-       Sample output:
-
-       ("Kamloops", "Canada")
-       ("Wewak Intl", "Papua New Guinea")
-       ...
-     */
-  }
-}
diff --git a/pairRdd/filter/AirportsNotInUsaSolution.scala b/pairRdd/filter/AirportsNotInUsaSolution.scala
deleted file mode 100644
index b014ffec..00000000
--- a/pairRdd/filter/AirportsNotInUsaSolution.scala
+++ /dev/null
@@ -1,21 +0,0 @@
-package com.sparkTutorial.pairRdd.filter
-
-import com.sparkTutorial.commons.Utils
-import org.apache.spark.{SparkConf, SparkContext}
-
-object AirportsNotInUsaSolution {
-
-  def main(args: Array[String]) {
-
-    val conf = new SparkConf().setAppName("airports").setMaster("local")
-    val sc = new SparkContext(conf)
-
-    val airportsRDD = sc.textFile("in/airports.text")
-
-    val airportPairRDD = airportsRDD.map(line => (line.split(Utils.COMMA_DELIMITER)(1),
-      line.split(Utils.COMMA_DELIMITER)(3)))
-    val airportsNotInUSA = airportPairRDD.filter(keyValue => keyValue._2 != "\"United States\"")
-
-    airportsNotInUSA.saveAsTextFile("out/airports_not_in_usa_pair_rdd.text")
-  }
-}
diff --git a/pairRdd/groupbykey/AirportsByCountryProblem.scala b/pairRdd/groupbykey/AirportsByCountryProblem.scala
deleted file mode 100644
index 5bfd7f3d..00000000
--- a/pairRdd/groupbykey/AirportsByCountryProblem.scala
+++ /dev/null
@@ -1,22 +0,0 @@
-package com.sparkTutorial.pairRdd.groupbykey
-
-object AirportsByCountryProblem {
-
-  def main(args: Array[String]) {
-
-    /* Create a Spark program to read the airport data from in/airports.text,
-       output the the list of the names of the airports located in each country.
-
-       Each row of the input file contains the following columns:
-       Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
-       ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
-
-       Sample output:
-
-       "Canada", List("Bagotville", "Montreal", "Coronation", ...)
-       "Norway" : List("Vigra", "Andenes", "Alta", "Bomoen", "Bronnoy",..)
-       "Papua New Guinea", List("Goroka", "Madang", ...)
-       ...
-     */
-  }
-}
diff --git a/pairRdd/groupbykey/AirportsByCountrySolution.scala b/pairRdd/groupbykey/AirportsByCountrySolution.scala
deleted file mode 100644
index 09aa25fa..00000000
--- a/pairRdd/groupbykey/AirportsByCountrySolution.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package com.sparkTutorial.pairRdd.groupbykey
-
-import com.sparkTutorial.commons.Utils
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object AirportsByCountrySolution {
-
-  def main(args: Array[String]) {
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("airports").setMaster("local[*]")
-    val sc = new SparkContext(conf)
-
-    val lines = sc.textFile("in/airports.text")
-
-    val countryAndAirportNameAndPair = lines.map(airport => (airport.split(Utils.COMMA_DELIMITER)(3),
-      airport.split(Utils.COMMA_DELIMITER)(1)))
-
-    val airportsByCountry = countryAndAirportNameAndPair.groupByKey()
-
-    for ((country, airportName) <- airportsByCountry.collectAsMap()) println(country + ": " + airportName.toList)
-  }
-}
diff --git a/pairRdd/groupbykey/GroupByKeyVsReduceByKey.scala b/pairRdd/groupbykey/GroupByKeyVsReduceByKey.scala
deleted file mode 100644
index 8ae2f601..00000000
--- a/pairRdd/groupbykey/GroupByKeyVsReduceByKey.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package com.sparkTutorial.pairRdd.groupbykey
-
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object GroupByKeyVsReduceByKey {
-
-  def main(args: Array[String]) {
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("GroupByKeyVsReduceByKey").setMaster("local[*]")
-    val sc = new SparkContext(conf)
-
-    val words = List("one", "two", "two", "three", "three", "three")
-    val wordsPairRdd = sc.parallelize(words).map(word => (word, 1))
-
-    val wordCountsWithReduceByKey = wordsPairRdd.reduceByKey((x, y) => x + y).collect()
-    println("wordCountsWithReduceByKey: " + wordCountsWithReduceByKey.toList)
-
-    val wordCountsWithGroupByKey = wordsPairRdd.groupByKey().mapValues(intIterable => intIterable.size).collect()
-    println("wordCountsWithGroupByKey: " + wordCountsWithGroupByKey.toList)
-  }
-}
-
diff --git a/pairRdd/join/JoinOperations.scala b/pairRdd/join/JoinOperations.scala
deleted file mode 100644
index 549a5745..00000000
--- a/pairRdd/join/JoinOperations.scala
+++ /dev/null
@@ -1,27 +0,0 @@
-package com.sparkTutorial.pairRdd.join
-
-import org.apache.spark.{SparkConf, SparkContext}
-
-object JoinOperations {
-
-  def main(args: Array[String]) {
-
-    val conf = new SparkConf().setAppName("JoinOperations").setMaster("local[1]")
-    val sc = new SparkContext(conf)
-
-    val ages = sc.parallelize(List(("Tom", 29),("John", 22)))
-    val addresses = sc.parallelize(List(("James", "USA"), ("John", "UK")))
-
-    val join = ages.join(addresses)
-    join.saveAsTextFile("out/age_address_join.text")
-
-    val leftOuterJoin = ages.leftOuterJoin(addresses)
-    leftOuterJoin.saveAsTextFile("out/age_address_left_out_join.text")
-
-    val rightOuterJoin = ages.rightOuterJoin(addresses)
-    rightOuterJoin.saveAsTextFile("out/age_address_right_out_join.text")
-
-    val fullOuterJoin = ages.fullOuterJoin(addresses)
-    fullOuterJoin.saveAsTextFile("out/age_address_full_out_join.text")
-  }
-}
diff --git a/pairRdd/mapValues/AirportsUppercaseProblem.scala b/pairRdd/mapValues/AirportsUppercaseProblem.scala
deleted file mode 100644
index ac9dac2f..00000000
--- a/pairRdd/mapValues/AirportsUppercaseProblem.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package com.sparkTutorial.pairRdd.mapValues
-
-object AirportsUppercaseProblem {
-
-  def main(args: Array[String]) {
-
-    /* Create a Spark program to read the airport data from in/airports.text, generate a pair RDD with airport name
-       being the key and country name being the value. Then convert the country name to uppercase and
-       output the pair RDD to out/airports_uppercase.text
-
-       Each row of the input file contains the following columns:
-
-       Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
-       ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
-
-       Sample output:
-
-       ("Kamloops", "CANADA")
-       ("Wewak Intl", "PAPUA NEW GUINEA")
-       ...
-     */
-  }
-}
diff --git a/pairRdd/mapValues/AirportsUppercaseSolution.scala b/pairRdd/mapValues/AirportsUppercaseSolution.scala
deleted file mode 100644
index 093fb193..00000000
--- a/pairRdd/mapValues/AirportsUppercaseSolution.scala
+++ /dev/null
@@ -1,22 +0,0 @@
-package com.sparkTutorial.pairRdd.mapValues
-
-import com.sparkTutorial.commons.Utils
-import org.apache.spark.{SparkConf, SparkContext}
-
-object AirportsUppercaseSolution {
-
-  def main(args: Array[String]) {
-
-    val conf = new SparkConf().setAppName("airports").setMaster("local")
-    val sc = new SparkContext(conf)
-
-    val airportsRDD = sc.textFile("in/airports.text")
-
-    val airportPairRDD = airportsRDD.map((line: String) => (line.split(Utils.COMMA_DELIMITER)(1),
-      line.split(Utils.COMMA_DELIMITER)(3)))
-
-    val upperCase = airportPairRDD.mapValues(countryName => countryName.toUpperCase)
-
-    upperCase.saveAsTextFile("out/airports_uppercase.text")
-  }
-}
diff --git a/pairRdd/sort/AverageHousePriceSolution.scala b/pairRdd/sort/AverageHousePriceSolution.scala
deleted file mode 100644
index 56eb591b..00000000
--- a/pairRdd/sort/AverageHousePriceSolution.scala
+++ /dev/null
@@ -1,28 +0,0 @@
-package com.sparkTutorial.pairRdd.sort
-
-import com.sparkTutorial.pairRdd.aggregation.reducebykey.housePrice.AvgCount
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object AverageHousePriceSolution {
-
-  def main(args: Array[String]) {
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("averageHousePriceSolution").setMaster("local[3]")
-    val sc = new SparkContext(conf)
-
-    val lines = sc.textFile("in/RealEstate.csv")
-    val cleanedLines = lines.filter(line => !line.contains("Bedrooms"))
-    val housePricePairRdd = cleanedLines.map(
-      line => (line.split(",")(3).toInt, AvgCount(1, line.split(",")(2).toDouble)))
-
-    val housePriceTotal = housePricePairRdd.reduceByKey((x, y) => AvgCount(x.count + y.count, x.total + y.total))
-
-    val housePriceAvg = housePriceTotal.mapValues(avgCount => avgCount.total / avgCount.count)
-
-    val sortedHousePriceAvg = housePriceAvg.sortByKey()
-
-    for ((bedrooms, avgPrice) <- sortedHousePriceAvg.collect()) println(bedrooms + " : " + avgPrice)
-  }
-
-}
diff --git a/pairRdd/sort/SortedWordCountProblem.scala b/pairRdd/sort/SortedWordCountProblem.scala
deleted file mode 100644
index 19cd5569..00000000
--- a/pairRdd/sort/SortedWordCountProblem.scala
+++ /dev/null
@@ -1,17 +0,0 @@
-package com.sparkTutorial.pairRdd.sort
-
-
-object SortedWordCountProblem {
-
-  /* Create a Spark program to read the an article from in/word_count.text,
-     output the number of occurrence of each word in descending order.
-
-     Sample output:
-
-     apple : 200
-     shoes : 193
-     bag : 176
-     ...
-   */
-}
-
diff --git a/pairRdd/sort/SortedWordCountSolution.scala b/pairRdd/sort/SortedWordCountSolution.scala
deleted file mode 100644
index 38e8abcb..00000000
--- a/pairRdd/sort/SortedWordCountSolution.scala
+++ /dev/null
@@ -1,28 +0,0 @@
-package com.sparkTutorial.pairRdd.sort
-
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object SortedWordCountSolution {
-
-  def main(args: Array[String]) {
-
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("wordCounts").setMaster("local[3]")
-    val sc = new SparkContext(conf)
-
-    val lines = sc.textFile("in/word_count.text")
-    val wordRdd = lines.flatMap(line => line.split(" "))
-
-    val wordPairRdd = wordRdd.map(word => (word, 1))
-    val wordToCountPairs = wordPairRdd.reduceByKey((x, y) => x + y)
-
-    val countToWordParis = wordToCountPairs.map(wordToCount => (wordToCount._2, wordToCount._1))
-
-    val sortedCountToWordParis = countToWordParis.sortByKey(ascending = false)
-
-    val sortedWordToCountPairs = sortedCountToWordParis.map(countToWord => (countToWord._2, countToWord._1))
-
-    for ((word, count) <- sortedWordToCountPairs.collect()) println(word + " : " + count)
-  }
-}

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy