From 4f0a0147ff6234b420a3a4d447a7c068fa428fb7 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 12:36:54 +0200
Subject: [PATCH 01/10] Added pairRdd/sort/*.py

---
 pairRdd/sort/AverageHousePriceSolution.py | 23 +++++++++++++++++++++
 pairRdd/sort/SortedWordCountProblem.py    | 16 ++++++++++++++++
 pairRdd/sort/SortedWordCountSolution.py   | 20 ++++++++++++++++++++
 3 files changed, 59 insertions(+)
 create mode 100644 pairRdd/sort/AverageHousePriceSolution.py
 create mode 100644 pairRdd/sort/SortedWordCountProblem.py
 create mode 100644 pairRdd/sort/SortedWordCountSolution.py

diff --git a/pairRdd/sort/AverageHousePriceSolution.py b/pairRdd/sort/AverageHousePriceSolution.py
new file mode 100644
index 00000000..4306fdbc
--- /dev/null
+++ b/pairRdd/sort/AverageHousePriceSolution.py
@@ -0,0 +1,23 @@
+from pairRdd.aggregation.reducebykey.housePrice.AvgCount import AvgCount
+from pyspark import SparkContext
+
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "averageHousePriceSolution")
+    sc.setLogLevel("ERROR")
+
+    lines = sc.textFile("in/RealEstate.csv")
+    cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)
+    housePricePairRdd = cleanedLines.map(lambda line: \
+        (int(float(line.split(",")[3])), AvgCount(1, float(line.split(",")[2]))))
+
+    housePriceTotal = housePricePairRdd.reduceByKey(lambda x, y: \
+        AvgCount(x.count + y.count, x.total + y.total))
+
+    housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount.total / avgCount.count)
+
+    sortedHousePriceAvg = housePriceAvg.sortByKey()
+
+    for bedrooms, avgPrice in sortedHousePriceAvg.collect():
+        print("{} : {}".format(bedrooms, avgPrice))
diff --git a/pairRdd/sort/SortedWordCountProblem.py b/pairRdd/sort/SortedWordCountProblem.py
new file mode 100644
index 00000000..bc7817b4
--- /dev/null
+++ b/pairRdd/sort/SortedWordCountProblem.py
@@ -0,0 +1,16 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    Create a Spark program to read an article from in/word_count.text,
+    then output the number of occurrences of each word in descending order.
+
+    Sample output:
+
+    apple : 200
+    shoes : 193
+    bag : 176
+    ...
+
+    '''
diff --git a/pairRdd/sort/SortedWordCountSolution.py b/pairRdd/sort/SortedWordCountSolution.py
new file mode 100644
index 00000000..398c57ae
--- /dev/null
+++ b/pairRdd/sort/SortedWordCountSolution.py
@@ -0,0 +1,20 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "wordCounts")
+    sc.setLogLevel("ERROR")
+    lines = sc.textFile("in/word_count.text")
+    wordRdd = lines.flatMap(lambda line: line.split(" "))
+
+    wordPairRdd = wordRdd.map(lambda word: (word, 1))
+    wordToCountPairs = wordPairRdd.reduceByKey(lambda x, y: x + y)
+
+    countToWordPairs = wordToCountPairs.map(lambda wordToCount: (wordToCount[1], wordToCount[0]))
+
+    sortedCountToWordPairs = countToWordPairs.sortByKey(ascending=False)
+
+    sortedWordToCountPairs = sortedCountToWordPairs.map(lambda countToWord: (countToWord[1], countToWord[0]))
+
+    for word, count in sortedWordToCountPairs.collect():
+        print("{} : {}".format(word, count))
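A note on SortedWordCountSolution.py above: it swaps (word, count) to (count, word) and back again purely so it can reuse sortByKey. PySpark RDDs also provide sortBy, which takes a key function and makes the double swap unnecessary. A minimal sketch of that variant, assuming the same in/word_count.text input (the app name is made up):

    from pyspark import SparkContext

    if __name__ == "__main__":
        sc = SparkContext("local", "wordCountsSortBy")
        sc.setLogLevel("ERROR")

        wordCounts = sc.textFile("in/word_count.text") \
            .flatMap(lambda line: line.split(" ")) \
            .map(lambda word: (word, 1)) \
            .reduceByKey(lambda x, y: x + y)

        # sortBy orders by an arbitrary key function, so the (word, count)
        # pairs can be sorted by count without swapping the tuples twice.
        for word, count in wordCounts.sortBy(lambda pair: pair[1], ascending=False).collect():
            print("{} : {}".format(word, count))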
From 56aa850dd0c94eab7052d278a2c183746dce5a16 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 12:45:11 +0200
Subject: [PATCH 02/10] Added pairRdd/mapValues/*.py

---
 pairRdd/mapValues/AirportsUppercaseProblem.py | 24 +++++++++++++++++++
 .../mapValues/AirportsUppercaseSolution.py    | 17 +++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 pairRdd/mapValues/AirportsUppercaseProblem.py
 create mode 100644 pairRdd/mapValues/AirportsUppercaseSolution.py

diff --git a/pairRdd/mapValues/AirportsUppercaseProblem.py b/pairRdd/mapValues/AirportsUppercaseProblem.py
new file mode 100644
index 00000000..124b37f0
--- /dev/null
+++ b/pairRdd/mapValues/AirportsUppercaseProblem.py
@@ -0,0 +1,24 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    Create a Spark program to read the airport data from in/airports.text, generate a pair RDD with airport name
+    being the key and country name being the value. Then convert the country name to uppercase and
+    output the pair RDD to out/airports_uppercase.text
+
+    Each row of the input file contains the following columns:
+
+    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
+    ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
+
+    Sample output:
+
+    ("Kamloops", "CANADA")
+    ("Wewak Intl", "PAPUA NEW GUINEA")
+    ...
+
+    '''
+
+
+
\ No newline at end of file
diff --git a/pairRdd/mapValues/AirportsUppercaseSolution.py b/pairRdd/mapValues/AirportsUppercaseSolution.py
new file mode 100644
index 00000000..3c9fa201
--- /dev/null
+++ b/pairRdd/mapValues/AirportsUppercaseSolution.py
@@ -0,0 +1,17 @@
+from pyspark import SparkContext
+from commons.Utils import Utils
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "airports")
+    sc.setLogLevel("ERROR")
+
+    airportsRDD = sc.textFile("in/airports.text")
+
+    airportPairRDD = airportsRDD.map(lambda line: \
+        (Utils.COMMA_DELIMITER.split(line)[1], \
+         Utils.COMMA_DELIMITER.split(line)[3]))
+
+    upperCase = airportPairRDD.mapValues(lambda countryName: countryName.upper())
+
+    upperCase.saveAsTextFile("out/airports_uppercase.text")
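The solution above (and several patches below) splits rows with Utils.COMMA_DELIMITER, but commons/Utils.py is not part of this series. As an assumption for readers following along, it is a precompiled regex along these lines, splitting only on commas that sit outside double quotes:

    import re

    class Utils():
        # Assumed definition (not shown in this patch series): a comma is a
        # delimiter only if an even number of quote characters follows it,
        # so a quoted field like "Montreal, QC" survives as one column.
        COMMA_DELIMITER = re.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''')

    # Quoted fields keep their surrounding quotes after the split, which is
    # why patch 05 below compares the country against "\"United States\""
    # literally. A representative airports.text row for illustration:
    row = '1,"Goroka","Goroka","Papua New Guinea","GKA","AYGA",-6.08,145.39,5282,10,"U","Pacific/Port_Moresby"'
    print(Utils.COMMA_DELIMITER.split(row)[3])   # prints "Papua New Guinea" (with quotes)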
From 244fe9b11d5ba3ddb8d0d72d90501bef5093e9ce Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 12:49:54 +0200
Subject: [PATCH 03/10] Added pairRdd/join/JoinOperations.py

---
 pairRdd/join/JoinOperations.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 pairRdd/join/JoinOperations.py

diff --git a/pairRdd/join/JoinOperations.py b/pairRdd/join/JoinOperations.py
new file mode 100644
index 00000000..250f9007
--- /dev/null
+++ b/pairRdd/join/JoinOperations.py
@@ -0,0 +1,21 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "JoinOperations")
+    sc.setLogLevel("ERROR")
+
+    ages = sc.parallelize([("Tom", 29), ("John", 22)])
+    addresses = sc.parallelize([("James", "USA"), ("John", "UK")])
+
+    join = ages.join(addresses)
+    join.saveAsTextFile("out/age_address_join.text")
+
+    leftOuterJoin = ages.leftOuterJoin(addresses)
+    leftOuterJoin.saveAsTextFile("out/age_address_left_out_join.text")
+
+    rightOuterJoin = ages.rightOuterJoin(addresses)
+    rightOuterJoin.saveAsTextFile("out/age_address_right_out_join.text")
+
+    fullOuterJoin = ages.fullOuterJoin(addresses)
+    fullOuterJoin.saveAsTextFile("out/age_address_full_out_join.text")
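For the two small RDDs in JoinOperations.py, the four joins produce the pairs below; PySpark fills the missing side of an outer join with None. A quick way to check, collecting instead of saving to text files (element order may vary):

    print(ages.join(addresses).collect())
    # [('John', (22, 'UK'))]                            keys present in both RDDs
    print(ages.leftOuterJoin(addresses).collect())
    # [('Tom', (29, None)), ('John', (22, 'UK'))]       every key from ages
    print(ages.rightOuterJoin(addresses).collect())
    # [('James', (None, 'USA')), ('John', (22, 'UK'))]  every key from addresses
    print(ages.fullOuterJoin(addresses).collect())
    # [('Tom', (29, None)), ('James', (None, 'USA')), ('John', (22, 'UK'))]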
From d6b58da2436d979a76bc0ed9ffa9027f65a2938f Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 12:57:12 +0200
Subject: [PATCH 04/10] Added pairRdd/groupbykey/*.py

---
 .../groupbykey/AirportsByCountryProblem.py    | 23 +++++++++++++++++++
 .../groupbykey/AirportsByCountrySolution.py   | 18 +++++++++++++++
 pairRdd/groupbykey/GroupByKeyVsReduceByKey.py | 18 +++++++++++++++
 3 files changed, 59 insertions(+)
 create mode 100644 pairRdd/groupbykey/AirportsByCountryProblem.py
 create mode 100644 pairRdd/groupbykey/AirportsByCountrySolution.py
 create mode 100644 pairRdd/groupbykey/GroupByKeyVsReduceByKey.py

diff --git a/pairRdd/groupbykey/AirportsByCountryProblem.py b/pairRdd/groupbykey/AirportsByCountryProblem.py
new file mode 100644
index 00000000..6e4b1e24
--- /dev/null
+++ b/pairRdd/groupbykey/AirportsByCountryProblem.py
@@ -0,0 +1,23 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    Create a Spark program to read the airport data from in/airports.text,
+    output the list of the names of the airports located in each country.
+
+    Each row of the input file contains the following columns:
+    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
+    ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
+
+    Sample output:
+
+    "Canada", ["Bagotville", "Montreal", "Coronation", ...]
+    "Norway" : ["Vigra", "Andenes", "Alta", "Bomoen", "Bronnoy",..]
+    "Papua New Guinea", ["Goroka", "Madang", ...]
+    ...
+
+    '''
+
+
+
\ No newline at end of file
diff --git a/pairRdd/groupbykey/AirportsByCountrySolution.py b/pairRdd/groupbykey/AirportsByCountrySolution.py
new file mode 100644
index 00000000..0cc7017e
--- /dev/null
+++ b/pairRdd/groupbykey/AirportsByCountrySolution.py
@@ -0,0 +1,18 @@
+from pyspark import SparkContext
+from commons.Utils import Utils
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "airports")
+    sc.setLogLevel("ERROR")
+
+    lines = sc.textFile("in/airports.text")
+
+    countryAndAirportNameAndPair = lines.map(lambda airport: \
+        (Utils.COMMA_DELIMITER.split(airport)[3],
+         Utils.COMMA_DELIMITER.split(airport)[1]))
+
+    airportsByCountry = countryAndAirportNameAndPair.groupByKey()
+
+    for country, airportName in airportsByCountry.collectAsMap().items():
+        print("{}: {}".format(country, list(airportName)))
diff --git a/pairRdd/groupbykey/GroupByKeyVsReduceByKey.py b/pairRdd/groupbykey/GroupByKeyVsReduceByKey.py
new file mode 100644
index 00000000..99eb96df
--- /dev/null
+++ b/pairRdd/groupbykey/GroupByKeyVsReduceByKey.py
@@ -0,0 +1,18 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "GroupByKeyVsReduceByKey")
+    sc.setLogLevel("ERROR")
+
+    words = ["one", "two", "two", "three", "three", "three"]
+    wordsPairRdd = sc.parallelize(words).map(lambda word: (word, 1))
+
+    wordCountsWithReduceByKey = wordsPairRdd.reduceByKey(lambda x, y: x + y).collect()
+    print("wordCountsWithReduceByKey: {}".format(list(wordCountsWithReduceByKey)))
+
+    wordCountsWithGroupByKey = wordsPairRdd \
+        .groupByKey() \
+        .mapValues(lambda intIterable: len(intIterable)) \
+        .collect()
+    print("wordCountsWithGroupByKey: {}".format(list(wordCountsWithGroupByKey)))
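GroupByKeyVsReduceByKey.py prints identical counts either way, but the two move different amounts of data: reduceByKey pre-aggregates on each partition before the shuffle, while groupByKey ships every raw (word, 1) pair across the network and materializes the whole iterable for each key. When the combine logic outgrows a single lambda, aggregateByKey keeps the map-side combine; a sketch reusing wordsPairRdd from the file above:

    # Same count via aggregateByKey: a zero value, a function folding one
    # record into a partition-local accumulator, and a function merging
    # accumulators from different partitions.
    wordCountsWithAggregateByKey = wordsPairRdd \
        .aggregateByKey(0, lambda acc, one: acc + one, lambda a, b: a + b) \
        .collect()
    print("wordCountsWithAggregateByKey: {}".format(wordCountsWithAggregateByKey))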
From a2068e706d028a70d43bd52f76a464c0a6644e2b Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 13:00:32 +0200
Subject: [PATCH 05/10] Added pairRdd/filter/*.py

---
 pairRdd/filter/AirportsNotInUsaProblem.py  | 20 ++++++++++++++++++++
 pairRdd/filter/AirportsNotInUsaSolution.py | 16 ++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 pairRdd/filter/AirportsNotInUsaProblem.py
 create mode 100644 pairRdd/filter/AirportsNotInUsaSolution.py

diff --git a/pairRdd/filter/AirportsNotInUsaProblem.py b/pairRdd/filter/AirportsNotInUsaProblem.py
new file mode 100644
index 00000000..d0e2da3a
--- /dev/null
+++ b/pairRdd/filter/AirportsNotInUsaProblem.py
@@ -0,0 +1,20 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    Create a Spark program to read the airport data from in/airports.text;
+    generate a pair RDD with airport name being the key and country name being the value.
+    Then remove all the airports which are located in the United States and output the pair RDD to out/airports_not_in_usa_pair_rdd.text
+
+    Each row of the input file contains the following columns:
+    Airport ID, Name of airport, Main city served by airport, Country where airport is located,
+    IATA/FAA code, ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
+
+    Sample output:
+
+    ("Kamloops", "Canada")
+    ("Wewak Intl", "Papua New Guinea")
+    ...
+
+    '''
diff --git a/pairRdd/filter/AirportsNotInUsaSolution.py b/pairRdd/filter/AirportsNotInUsaSolution.py
new file mode 100644
index 00000000..fd9c2fe9
--- /dev/null
+++ b/pairRdd/filter/AirportsNotInUsaSolution.py
@@ -0,0 +1,16 @@
+from pyspark import SparkContext
+from commons.Utils import Utils
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "airports")
+    sc.setLogLevel("ERROR")
+
+    airportsRDD = sc.textFile("in/airports.text")
+
+    airportPairRDD = airportsRDD.map(lambda line: \
+        (Utils.COMMA_DELIMITER.split(line)[1],
+         Utils.COMMA_DELIMITER.split(line)[3]))
+    airportsNotInUSA = airportPairRDD.filter(lambda keyValue: keyValue[1] != "\"United States\"")
+
+    airportsNotInUSA.saveAsTextFile("out/airports_not_in_usa_pair_rdd.text")

From c6a3a82cdc24f6908cc6de7cad2cb8600216a085 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 13:03:24 +0200
Subject: [PATCH 06/10] Added pairRdd/create/*.py

---
 pairRdd/create/PairRddFromRegularRdd.py | 12 ++++++++++++
 pairRdd/create/PairRddFromTupleList.py  | 11 +++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 pairRdd/create/PairRddFromRegularRdd.py
 create mode 100644 pairRdd/create/PairRddFromTupleList.py

diff --git a/pairRdd/create/PairRddFromRegularRdd.py b/pairRdd/create/PairRddFromRegularRdd.py
new file mode 100644
index 00000000..bfd6f187
--- /dev/null
+++ b/pairRdd/create/PairRddFromRegularRdd.py
@@ -0,0 +1,12 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "create")
+    sc.setLogLevel("ERROR")
+
+    inputStrings = ["Lily 23", "Jack 29", "Mary 29", "James 8"]
+    regularRDDs = sc.parallelize(inputStrings)
+
+    pairRDD = regularRDDs.map(lambda s: (s.split(" ")[0], s.split(" ")[1]))
+    pairRDD.coalesce(1).saveAsTextFile("out/pair_rdd_from_regular_rdd")
diff --git a/pairRdd/create/PairRddFromTupleList.py b/pairRdd/create/PairRddFromTupleList.py
new file mode 100644
index 00000000..c728d005
--- /dev/null
+++ b/pairRdd/create/PairRddFromTupleList.py
@@ -0,0 +1,11 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "create")
+    sc.setLogLevel("ERROR")
+
+    tuples = [("Lily", 23), ("Jack", 29), ("Mary", 29), ("James", 8)]
+    pairRDD = sc.parallelize(tuples)
+
+    pairRDD.coalesce(1).saveAsTextFile("out/pair_rdd_from_tuple_list")

From 7bcccc9aa2fc88d2742795120ccd2eb5935f421f Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 13:13:59 +0200
Subject: [PATCH 07/10] Added
 pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py

---
 .../combinebykey/AverageHousePriceSolution.py | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py

diff --git a/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py b/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py
new file mode 100644
index 00000000..4885ccbe
--- /dev/null
+++ b/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py
@@ -0,0 +1,21 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "AverageHousePrice")
+    sc.setLogLevel("ERROR")
+
+    lines = sc.textFile("in/RealEstate.csv")
+    cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)
+
+    housePricePairRdd = cleanedLines.map(lambda line: (line.split(",")[3], float(line.split(",")[2])))
+
+    createCombiner = lambda x: (1, x)
+    mergeValue = lambda avgCount, x: (avgCount[0] + 1, avgCount[1] + x)
+    mergeCombiners = lambda avgCountA, avgCountB: (avgCountA[0] + avgCountB[0], avgCountA[1] + avgCountB[1])
+
+    housePriceTotal = housePricePairRdd.combineByKey(createCombiner, mergeValue, mergeCombiners)
+
+    housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount[1] / avgCount[0])
+    for bedrooms, avgPrice in housePriceAvg.collect():
+        print("{} : {}".format(bedrooms, avgPrice))
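The three callbacks handed to combineByKey above are easiest to follow on a tiny trace. The numbers below are made up for illustration: a single key (the "3"-bedroom bucket) with its prices spread over two partitions:

    # Partition 1 holds prices [300000.0, 350000.0] for key "3";
    # partition 2 holds [250000.0].
    #
    # createCombiner(300000.0)                      -> (1, 300000.0)   first value in partition 1
    # mergeValue((1, 300000.0), 350000.0)           -> (2, 650000.0)   same partition
    # createCombiner(250000.0)                      -> (1, 250000.0)   first value in partition 2
    # mergeCombiners((2, 650000.0), (1, 250000.0))  -> (3, 900000.0)   across partitions
    # mapValues: 900000.0 / 3                       -> 300000.0        final average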
From 43f7883c03722e16c5b3fa0809f574240613862c Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 13:16:18 +0200
Subject: [PATCH 08/10] Added pairRdd/aggregation/reducebykey/WordCount.py

---
 pairRdd/aggregation/reducebykey/WordCount.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 pairRdd/aggregation/reducebykey/WordCount.py

diff --git a/pairRdd/aggregation/reducebykey/WordCount.py b/pairRdd/aggregation/reducebykey/WordCount.py
new file mode 100644
index 00000000..3a00f380
--- /dev/null
+++ b/pairRdd/aggregation/reducebykey/WordCount.py
@@ -0,0 +1,14 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "wordCounts")
+    sc.setLogLevel("ERROR")
+
+    lines = sc.textFile("in/word_count.text")
+    wordRdd = lines.flatMap(lambda line: line.split(" "))
+    wordPairRdd = wordRdd.map(lambda word: (word, 1))
+
+    wordCounts = wordPairRdd.reduceByKey(lambda x, y: x + y)
+    for word, count in wordCounts.collect():
+        print("{} : {}".format(word, count))

From fb3dfcbf7b00b5e7026c9b8aa8fd997d92d79d58 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 13:19:35 +0200
Subject: [PATCH 09/10] Added pairRdd/aggregation/reducebykey/housePrice/*.py

---
 .../housePrice/AverageHousePriceProblem.py  | 35 +++++++++++++++++++
 .../housePrice/AverageHousePriceSolution.py | 24 +++++++++++++
 .../reducebykey/housePrice/AvgCount.py      |  7 ++++
 3 files changed, 66 insertions(+)
 create mode 100644 pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.py
 create mode 100644 pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.py
 create mode 100644 pairRdd/aggregation/reducebykey/housePrice/AvgCount.py

diff --git a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.py b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.py
new file mode 100644
index 00000000..caf05e89
--- /dev/null
+++ b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.py
@@ -0,0 +1,35 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    Create a Spark program to read the house data from in/RealEstate.csv,
+    output the average price for houses with different numbers of bedrooms.
+
+    The houses dataset contains a collection of recent real estate listings in San Luis Obispo county and
+    around it.
+
+    The dataset contains the following fields:
+    1. MLS: Multiple listing service number for the house (unique ID).
+    2. Location: city/town where the house is located. Most locations are in San Luis Obispo county and
+    northern Santa Barbara county (Santa Maria-Orcutt, Lompoc, Guadelupe, Los Alamos), but there are
+    some out-of-area locations as well.
+    3. Price: the most recent listing price of the house (in dollars).
+    4. Bedrooms: number of bedrooms.
+    5. Bathrooms: number of bathrooms.
+    6. Size: size of the house in square feet.
+    7. Price/SQ.ft: price of the house per square foot.
+    8. Status: type of sale. Three types are represented in the dataset: Short Sale, Foreclosure and Regular.
+
+    Each field is comma separated.
+
+    Sample output:
+
+    (3, 325000)
+    (1, 266356)
+    (2, 325000)
+    ...
+
+    3, 1 and 2 mean the number of bedrooms. 325000 means the average price of houses with 3 bedrooms is 325000.
+
+    '''
diff --git a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.py b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.py
new file mode 100644
index 00000000..acb633e2
--- /dev/null
+++ b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.py
@@ -0,0 +1,24 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    sc = SparkContext("local", "avgHousePrice")
+    sc.setLogLevel("ERROR")
+
+    lines = sc.textFile("in/RealEstate.csv")
+    cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)
+
+    housePricePairRdd = cleanedLines.map(lambda line: \
+        (line.split(",")[3], (1, float(line.split(",")[2]))))
+
+    housePriceTotal = housePricePairRdd \
+        .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
+
+    print("housePriceTotal: ")
+    for bedroom, total in housePriceTotal.collect():
+        print("{} : {}".format(bedroom, total))
+
+    housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount[1] / avgCount[0])
+    print("\nhousePriceAvg: ")
+    for bedroom, avg in housePriceAvg.collect():
+        print("{} : {}".format(bedroom, avg))
diff --git a/pairRdd/aggregation/reducebykey/housePrice/AvgCount.py b/pairRdd/aggregation/reducebykey/housePrice/AvgCount.py
new file mode 100644
index 00000000..37a59ede
--- /dev/null
+++ b/pairRdd/aggregation/reducebykey/housePrice/AvgCount.py
@@ -0,0 +1,7 @@
+class AvgCount():
+
+    def __init__(self, count: int, total: float):
+        self.count = count
+        self.total = total
+
+
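AvgCount here is the same class the sort solution in patch 01 imports. Bundling count and total into one value lets reduceByKey carry both running sums in a single pass, leaving the average to one mapValues call. A standalone sketch of the arithmetic, with made-up prices:

    from pairRdd.aggregation.reducebykey.housePrice.AvgCount import AvgCount

    a = AvgCount(2, 650000.0)   # two listings totalling 650000
    b = AvgCount(1, 250000.0)   # one more listing
    merged = AvgCount(a.count + b.count, a.total + b.total)
    print(merged.total / merged.count)   # 300000.0, the running average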
From ee71addec096eea8c6f0c81c744bb596b71e811d Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Mon, 2 Oct 2017 13:20:38 +0200
Subject: [PATCH 10/10] Finished pairRdd folder | Removed scala files from
 folder pairRdd

---
 .../combinebykey/AverageHousePriceSolution.scala | 30 ---------
 .../aggregation/reducebykey/WordCount.scala      | 20 ----------
 .../housePrice/AverageHousePriceProblem.scala    | 38 -------------------
 .../AverageHousePriceSolution.scala              | 27 -------------
 .../reducebykey/housePrice/AvgCount.scala        |  4 --
 pairRdd/create/PairRddFromRegularRdd.scala       | 18 ---------
 pairRdd/create/PairRddFromTupleList.scala        | 17 ---------
 pairRdd/filter/AirportsNotInUsaProblem.scala     | 22 -----------
 pairRdd/filter/AirportsNotInUsaSolution.scala    | 21 ----------
 .../groupbykey/AirportsByCountryProblem.scala    | 22 -----------
 .../groupbykey/AirportsByCountrySolution.scala   | 23 -----------
 .../groupbykey/GroupByKeyVsReduceByKey.scala     | 23 -----------
 pairRdd/join/JoinOperations.scala                | 27 -------------
 .../mapValues/AirportsUppercaseProblem.scala     | 23 -----------
 .../mapValues/AirportsUppercaseSolution.scala    | 22 -----------
 pairRdd/sort/AverageHousePriceSolution.scala     | 28 --------------
 pairRdd/sort/SortedWordCountProblem.scala        | 17 ---------
 pairRdd/sort/SortedWordCountSolution.scala       | 28 --------------
 18 files changed, 410 deletions(-)
 delete mode 100644 pairRdd/aggregation/combinebykey/AverageHousePriceSolution.scala
 delete mode 100644 pairRdd/aggregation/reducebykey/WordCount.scala
 delete mode 100644 pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.scala
 delete mode 100644 pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.scala
 delete mode 100644 pairRdd/aggregation/reducebykey/housePrice/AvgCount.scala
 delete mode 100644 pairRdd/create/PairRddFromRegularRdd.scala
 delete mode 100644 pairRdd/create/PairRddFromTupleList.scala
 delete mode 100644 pairRdd/filter/AirportsNotInUsaProblem.scala
 delete mode 100644 pairRdd/filter/AirportsNotInUsaSolution.scala
 delete mode 100644 pairRdd/groupbykey/AirportsByCountryProblem.scala
 delete mode 100644 pairRdd/groupbykey/AirportsByCountrySolution.scala
 delete mode 100644 pairRdd/groupbykey/GroupByKeyVsReduceByKey.scala
 delete mode 100644 pairRdd/join/JoinOperations.scala
 delete mode 100644 pairRdd/mapValues/AirportsUppercaseProblem.scala
 delete mode 100644 pairRdd/mapValues/AirportsUppercaseSolution.scala
 delete mode 100644 pairRdd/sort/AverageHousePriceSolution.scala
 delete mode 100644 pairRdd/sort/SortedWordCountProblem.scala
 delete mode 100644 pairRdd/sort/SortedWordCountSolution.scala

diff --git a/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.scala b/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.scala
deleted file mode 100644
index e744546f..00000000
--- a/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.scala
+++ /dev/null
@@ -1,30 +0,0 @@
-package com.sparkTutorial.pairRdd.aggregation.combinebykey
-
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object AverageHousePriceSolution {
-
-  def main(args: Array[String]) {
-
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("wordCounts").setMaster("local[3]")
-    val sc = new SparkContext(conf)
-
-    val lines = sc.textFile("in/RealEstate.csv")
-    val cleanedLines = lines.filter(line => !line.contains("Bedrooms"))
-
-    val housePricePairRdd = cleanedLines.map(line => (line.split(",")(3), line.split(",")(2).toDouble))
-
-    val createCombiner = (x: Double) => (1, x)
-    val mergeValue = (avgCount: AvgCount, x: Double) => (avgCount._1 + 1, avgCount._2 + x)
-    val mergeCombiners = (avgCountA: AvgCount, avgCountB: AvgCount) => (avgCountA._1 + avgCountB._1, avgCountA._2 + avgCountB._2)
-
-    val housePriceTotal = housePricePairRdd.combineByKey(createCombiner, mergeValue, mergeCombiners)
-
-    val housePriceAvg = housePriceTotal.mapValues(avgCount => avgCount._2 / avgCount._1)
-    for ((bedrooms, avgPrice) <- housePriceAvg.collect()) println(bedrooms + " : " + avgPrice)
-  }
-
-  type AvgCount = (Int, Double)
-}
diff --git a/pairRdd/aggregation/reducebykey/WordCount.scala b/pairRdd/aggregation/reducebykey/WordCount.scala
deleted file mode 100644
index 89d51803..00000000
--- a/pairRdd/aggregation/reducebykey/WordCount.scala
+++ /dev/null
@@ -1,20 +0,0 @@
-package com.sparkTutorial.pairRdd.aggregation.reducebykey
-
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object WordCount {
-
-  def main(args: Array[String]) {
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("wordCounts").setMaster("local[3]")
-    val sc = new SparkContext(conf)
-
-    val lines = sc.textFile("in/word_count.text")
-    val wordRdd = lines.flatMap(line => line.split(" "))
-    val wordPairRdd = wordRdd.map(word => (word, 1))
-
-    val wordCounts = wordPairRdd.reduceByKey((x, y) => x + y)
-    for ((word, count) <- wordCounts.collect()) println(word + " : " + count)
-  }
-}
diff --git a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.scala b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.scala
deleted file mode 100644
index 1d433a6b..00000000
--- a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.scala
+++ /dev/null
@@ -1,38 +0,0 @@
-package com.sparkTutorial.pairRdd.aggregation.reducebykey.housePrice
-
-object AverageHousePriceProblem {
-
-  def main(args: Array[String]) {
-
-    /* Create a Spark program to read the house data from in/RealEstate.csv,
-       output the average price for houses with different number of bedrooms.
-
-    The houses dataset contains a collection of recent real estate listings in San Luis Obispo county and
-    around it.
-
-    The dataset contains the following fields:
-    1. MLS: Multiple listing service number for the house (unique ID).
-    2. Location: city/town where the house is located. Most locations are in San Luis Obispo county and
-    northern Santa Barbara county (Santa Maria-Orcutt, Lompoc, Guadelupe, Los Alamos), but there
-    some out of area locations as well.
-    3. Price: the most recent listing price of the house (in dollars).
-    4. Bedrooms: number of bedrooms.
-    5. Bathrooms: number of bathrooms.
-    6. Size: size of the house in square feet.
-    7. Price/SQ.ft: price of the house per square foot.
-    8. Status: type of sale. Thee types are represented in the dataset: Short Sale, Foreclosure and Regular.
-
-    Each field is comma separated.
-
-    Sample output:
-
-    (3, 325000)
-    (1, 266356)
-    (2, 325000)
-    ...
-
-    3, 1 and 2 mean the number of bedrooms. 325000 means the average price of houses with 3 bedrooms is 325000.
-    */
-  }
-
-}
diff --git a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.scala b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.scala
deleted file mode 100644
index 29cf8b38..00000000
--- a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.scala
+++ /dev/null
@@ -1,27 +0,0 @@
-package com.sparkTutorial.pairRdd.aggregation.reducebykey.housePrice
-
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object AverageHousePriceSolution {
-
-  def main(args: Array[String]) {
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("avgHousePrice").setMaster("local[3]")
-    val sc = new SparkContext(conf)
-
-    val lines = sc.textFile("in/RealEstate.csv")
-    val cleanedLines = lines.filter(line => !line.contains("Bedrooms"))
-
-    val housePricePairRdd = cleanedLines.map(line => (line.split(",")(3), (1, line.split(",")(2).toDouble)))
-
-    val housePriceTotal = housePricePairRdd.reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2))
-
-    println("housePriceTotal: ")
-    for ((bedroom, total) <- housePriceTotal.collect()) println(bedroom + " : " + total)
-
-    val housePriceAvg = housePriceTotal.mapValues(avgCount => avgCount._2 / avgCount._1)
-    println("housePriceAvg: ")
-    for ((bedroom, avg) <- housePriceAvg.collect()) println(bedroom + " : " + avg)
-  }
-}
diff --git a/pairRdd/aggregation/reducebykey/housePrice/AvgCount.scala b/pairRdd/aggregation/reducebykey/housePrice/AvgCount.scala
deleted file mode 100644
index 8eb0ea9a..00000000
--- a/pairRdd/aggregation/reducebykey/housePrice/AvgCount.scala
+++ /dev/null
@@ -1,4 +0,0 @@
-package com.sparkTutorial.pairRdd.aggregation.reducebykey.housePrice
-
-case class AvgCount(count: Int, total: Double)
-
diff --git a/pairRdd/create/PairRddFromRegularRdd.scala b/pairRdd/create/PairRddFromRegularRdd.scala
deleted file mode 100644
index 5fa8ec05..00000000
--- a/pairRdd/create/PairRddFromRegularRdd.scala
+++ /dev/null
@@ -1,18 +0,0 @@
-package com.sparkTutorial.pairRdd.create
-
-import org.apache.spark.{SparkConf, SparkContext}
-
-object PairRddFromRegularRdd {
-
-  def main(args: Array[String]) {
-
-    val conf = new SparkConf().setAppName("create").setMaster("local[1]")
-    val sc = new SparkContext(conf)
-
-    val inputStrings = List("Lily 23", "Jack 29", "Mary 29", "James 8")
-    val regularRDDs = sc.parallelize(inputStrings)
-
-    val pairRDD = regularRDDs.map(s => (s.split(" ")(0), s.split(" ")(1)))
-    pairRDD.coalesce(1).saveAsTextFile("out/pair_rdd_from_regular_rdd")
-  }
-}
diff --git a/pairRdd/create/PairRddFromTupleList.scala b/pairRdd/create/PairRddFromTupleList.scala
deleted file mode 100644
index 3b9a7632..00000000
--- a/pairRdd/create/PairRddFromTupleList.scala
+++ /dev/null
@@ -1,17 +0,0 @@
-package com.sparkTutorial.pairRdd.create
-
-import org.apache.spark.{SparkConf, SparkContext}
-
-object PairRddFromTupleList {
-
-  def main(args: Array[String]) {
-
-    val conf = new SparkConf().setAppName("create").setMaster("local[1]")
-    val sc = new SparkContext(conf)
-
-    val tuple = List(("Lily", 23), ("Jack", 29), ("Mary", 29), ("James", 8))
-    val pairRDD = sc.parallelize(tuple)
-
-    pairRDD.coalesce(1).saveAsTextFile("out/pair_rdd_from_tuple_list")
-  }
-}
diff --git a/pairRdd/filter/AirportsNotInUsaProblem.scala b/pairRdd/filter/AirportsNotInUsaProblem.scala
deleted file mode 100644
index b1613b53..00000000
--- a/pairRdd/filter/AirportsNotInUsaProblem.scala
+++ /dev/null
@@ -1,22 +0,0 @@
-package com.sparkTutorial.pairRdd.filter
-
-object AirportsNotInUsaProblem {
-
-  def main(args: Array[String]) {
-
-    /* Create a Spark program to read the airport data from in/airports.text;
-       generate a pair RDD with airport name being the key and country name being the value.
-       Then remove all the airports which are located in United States and output the pair RDD to out/airports_not_in_usa_pair_rdd.text
-
-       Each row of the input file contains the following columns:
-       Airport ID, Name of airport, Main city served by airport, Country where airport is located,
-       IATA/FAA code, ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
-
-       Sample output:
-
-       ("Kamloops", "Canada")
-       ("Wewak Intl", "Papua New Guinea")
-       ...
-     */
-  }
-}
diff --git a/pairRdd/filter/AirportsNotInUsaSolution.scala b/pairRdd/filter/AirportsNotInUsaSolution.scala
deleted file mode 100644
index b014ffec..00000000
--- a/pairRdd/filter/AirportsNotInUsaSolution.scala
+++ /dev/null
@@ -1,21 +0,0 @@
-package com.sparkTutorial.pairRdd.filter
-
-import com.sparkTutorial.commons.Utils
-import org.apache.spark.{SparkConf, SparkContext}
-
-object AirportsNotInUsaSolution {
-
-  def main(args: Array[String]) {
-
-    val conf = new SparkConf().setAppName("airports").setMaster("local")
-    val sc = new SparkContext(conf)
-
-    val airportsRDD = sc.textFile("in/airports.text")
-
-    val airportPairRDD = airportsRDD.map(line => (line.split(Utils.COMMA_DELIMITER)(1),
-      line.split(Utils.COMMA_DELIMITER)(3)))
-    val airportsNotInUSA = airportPairRDD.filter(keyValue => keyValue._2 != "\"United States\"")
-
-    airportsNotInUSA.saveAsTextFile("out/airports_not_in_usa_pair_rdd.text")
-  }
-}
diff --git a/pairRdd/groupbykey/AirportsByCountryProblem.scala b/pairRdd/groupbykey/AirportsByCountryProblem.scala
deleted file mode 100644
index 5bfd7f3d..00000000
--- a/pairRdd/groupbykey/AirportsByCountryProblem.scala
+++ /dev/null
@@ -1,22 +0,0 @@
-package com.sparkTutorial.pairRdd.groupbykey
-
-object AirportsByCountryProblem {
-
-  def main(args: Array[String]) {
-
-    /* Create a Spark program to read the airport data from in/airports.text,
-       output the the list of the names of the airports located in each country.
-
-       Each row of the input file contains the following columns:
-       Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
-       ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
-
-       Sample output:
-
-       "Canada", List("Bagotville", "Montreal", "Coronation", ...)
-       "Norway" : List("Vigra", "Andenes", "Alta", "Bomoen", "Bronnoy",..)
-       "Papua New Guinea", List("Goroka", "Madang", ...)
-       ...
-     */
-  }
-}
diff --git a/pairRdd/groupbykey/AirportsByCountrySolution.scala b/pairRdd/groupbykey/AirportsByCountrySolution.scala
deleted file mode 100644
index 09aa25fa..00000000
--- a/pairRdd/groupbykey/AirportsByCountrySolution.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package com.sparkTutorial.pairRdd.groupbykey
-
-import com.sparkTutorial.commons.Utils
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object AirportsByCountrySolution {
-
-  def main(args: Array[String]) {
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("airports").setMaster("local[*]")
-    val sc = new SparkContext(conf)
-
-    val lines = sc.textFile("in/airports.text")
-
-    val countryAndAirportNameAndPair = lines.map(airport => (airport.split(Utils.COMMA_DELIMITER)(3),
-      airport.split(Utils.COMMA_DELIMITER)(1)))
-
-    val airportsByCountry = countryAndAirportNameAndPair.groupByKey()
-
-    for ((country, airportName) <- airportsByCountry.collectAsMap()) println(country + ": " + airportName.toList)
-  }
-}
diff --git a/pairRdd/groupbykey/GroupByKeyVsReduceByKey.scala b/pairRdd/groupbykey/GroupByKeyVsReduceByKey.scala
deleted file mode 100644
index 8ae2f601..00000000
--- a/pairRdd/groupbykey/GroupByKeyVsReduceByKey.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package com.sparkTutorial.pairRdd.groupbykey
-
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object GroupByKeyVsReduceByKey {
-
-  def main(args: Array[String]) {
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("GroupByKeyVsReduceByKey").setMaster("local[*]")
-    val sc = new SparkContext(conf)
-
-    val words = List("one", "two", "two", "three", "three", "three")
-    val wordsPairRdd = sc.parallelize(words).map(word => (word, 1))
-
-    val wordCountsWithReduceByKey = wordsPairRdd.reduceByKey((x, y) => x + y).collect()
-    println("wordCountsWithReduceByKey: " + wordCountsWithReduceByKey.toList)
-
-    val wordCountsWithGroupByKey = wordsPairRdd.groupByKey().mapValues(intIterable => intIterable.size).collect()
-    println("wordCountsWithGroupByKey: " + wordCountsWithGroupByKey.toList)
-  }
-}
-
diff --git a/pairRdd/join/JoinOperations.scala b/pairRdd/join/JoinOperations.scala
deleted file mode 100644
index 549a5745..00000000
--- a/pairRdd/join/JoinOperations.scala
+++ /dev/null
@@ -1,27 +0,0 @@
-package com.sparkTutorial.pairRdd.join
-
-import org.apache.spark.{SparkConf, SparkContext}
-
-object JoinOperations {
-
-  def main(args: Array[String]) {
-
-    val conf = new SparkConf().setAppName("JoinOperations").setMaster("local[1]")
-    val sc = new SparkContext(conf)
-
-    val ages = sc.parallelize(List(("Tom", 29),("John", 22)))
-    val addresses = sc.parallelize(List(("James", "USA"), ("John", "UK")))
-
-    val join = ages.join(addresses)
-    join.saveAsTextFile("out/age_address_join.text")
-
-    val leftOuterJoin = ages.leftOuterJoin(addresses)
-    leftOuterJoin.saveAsTextFile("out/age_address_left_out_join.text")
-
-    val rightOuterJoin = ages.rightOuterJoin(addresses)
-    rightOuterJoin.saveAsTextFile("out/age_address_right_out_join.text")
-
-    val fullOuterJoin = ages.fullOuterJoin(addresses)
-    fullOuterJoin.saveAsTextFile("out/age_address_full_out_join.text")
-  }
-}
diff --git a/pairRdd/mapValues/AirportsUppercaseProblem.scala b/pairRdd/mapValues/AirportsUppercaseProblem.scala
deleted file mode 100644
index ac9dac2f..00000000
--- a/pairRdd/mapValues/AirportsUppercaseProblem.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package com.sparkTutorial.pairRdd.mapValues
-
-object AirportsUppercaseProblem {
-
-  def main(args: Array[String]) {
-
-    /* Create a Spark program to read the airport data from in/airports.text, generate a pair RDD with airport name
-       being the key and country name being the value. Then convert the country name to uppercase and
-       output the pair RDD to out/airports_uppercase.text
-
-       Each row of the input file contains the following columns:
-
-       Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
-       ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format
-
-       Sample output:
-
-       ("Kamloops", "CANADA")
-       ("Wewak Intl", "PAPUA NEW GUINEA")
-       ...
-     */
-  }
-}
diff --git a/pairRdd/mapValues/AirportsUppercaseSolution.scala b/pairRdd/mapValues/AirportsUppercaseSolution.scala
deleted file mode 100644
index 093fb193..00000000
--- a/pairRdd/mapValues/AirportsUppercaseSolution.scala
+++ /dev/null
@@ -1,22 +0,0 @@
-package com.sparkTutorial.pairRdd.mapValues
-
-import com.sparkTutorial.commons.Utils
-import org.apache.spark.{SparkConf, SparkContext}
-
-object AirportsUppercaseSolution {
-
-  def main(args: Array[String]) {
-
-    val conf = new SparkConf().setAppName("airports").setMaster("local")
-    val sc = new SparkContext(conf)
-
-    val airportsRDD = sc.textFile("in/airports.text")
-
-    val airportPairRDD = airportsRDD.map((line: String) => (line.split(Utils.COMMA_DELIMITER)(1),
-      line.split(Utils.COMMA_DELIMITER)(3)))
-
-    val upperCase = airportPairRDD.mapValues(countryName => countryName.toUpperCase)
-
-    upperCase.saveAsTextFile("out/airports_uppercase.text")
-  }
-}
diff --git a/pairRdd/sort/AverageHousePriceSolution.scala b/pairRdd/sort/AverageHousePriceSolution.scala
deleted file mode 100644
index 56eb591b..00000000
--- a/pairRdd/sort/AverageHousePriceSolution.scala
+++ /dev/null
@@ -1,28 +0,0 @@
-package com.sparkTutorial.pairRdd.sort
-
-import com.sparkTutorial.pairRdd.aggregation.reducebykey.housePrice.AvgCount
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object AverageHousePriceSolution {
-
-  def main(args: Array[String]) {
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("averageHousePriceSolution").setMaster("local[3]")
-    val sc = new SparkContext(conf)
-
-    val lines = sc.textFile("in/RealEstate.csv")
-    val cleanedLines = lines.filter(line => !line.contains("Bedrooms"))
-    val housePricePairRdd = cleanedLines.map(
-      line => (line.split(",")(3).toInt, AvgCount(1, line.split(",")(2).toDouble)))
-
-    val housePriceTotal = housePricePairRdd.reduceByKey((x, y) => AvgCount(x.count + y.count, x.total + y.total))
-
-    val housePriceAvg = housePriceTotal.mapValues(avgCount => avgCount.total / avgCount.count)
-
-    val sortedHousePriceAvg = housePriceAvg.sortByKey()
-
-    for ((bedrooms, avgPrice) <- sortedHousePriceAvg.collect()) println(bedrooms + " : " + avgPrice)
-  }
-
-}
diff --git a/pairRdd/sort/SortedWordCountProblem.scala b/pairRdd/sort/SortedWordCountProblem.scala
deleted file mode 100644
index 19cd5569..00000000
--- a/pairRdd/sort/SortedWordCountProblem.scala
+++ /dev/null
@@ -1,17 +0,0 @@
-package com.sparkTutorial.pairRdd.sort
-
-
-object SortedWordCountProblem {
-
-  /* Create a Spark program to read the an article from in/word_count.text,
-     output the number of occurrence of each word in descending order.
-
-     Sample output:
-
-     apple : 200
-     shoes : 193
-     bag : 176
-     ...
-   */
-}
-
diff --git a/pairRdd/sort/SortedWordCountSolution.scala b/pairRdd/sort/SortedWordCountSolution.scala
deleted file mode 100644
index 38e8abcb..00000000
--- a/pairRdd/sort/SortedWordCountSolution.scala
+++ /dev/null
@@ -1,28 +0,0 @@
-package com.sparkTutorial.pairRdd.sort
-
-import org.apache.log4j.{Level, Logger}
-import org.apache.spark.{SparkConf, SparkContext}
-
-object SortedWordCountSolution {
-
-  def main(args: Array[String]) {
-
-    Logger.getLogger("org").setLevel(Level.ERROR)
-    val conf = new SparkConf().setAppName("wordCounts").setMaster("local[3]")
-    val sc = new SparkContext(conf)
-
-    val lines = sc.textFile("in/word_count.text")
-    val wordRdd = lines.flatMap(line => line.split(" "))
-
-    val wordPairRdd = wordRdd.map(word => (word, 1))
-    val wordToCountPairs = wordPairRdd.reduceByKey((x, y) => x + y)
-
-    val countToWordParis = wordToCountPairs.map(wordToCount => (wordToCount._2, wordToCount._1))
-
-    val sortedCountToWordParis = countToWordParis.sortByKey(ascending = false)
-
-    val sortedWordToCountPairs = sortedCountToWordParis.map(countToWord => (countToWord._2, countToWord._1))
-
-    for ((word, count) <- sortedWordToCountPairs.collect()) println(word + " : " + count)
-  }
-}

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy