From 942672bc09be7e6c23d508191826ca09d970e6e8 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Wed, 31 Jan 2018 06:55:49 -0800
Subject: [PATCH 1/2] Added sys path to guarantee imports | Added SparkConf to
 all files

---
 advanced/accumulator/StackOverFlowSurvey.py   | 29 ++++++++-------
 .../StackOverFlowSurveyFollowUp.py            | 27 +++++++-------
 advanced/broadcast/UkMakerSpaces.py           | 18 +++++-----
 .../UkMakerSpacesWithoutBroadcast.py          | 22 ++++++------
 .../combinebykey/AverageHousePriceSolution.py |  7 ++--
 .../housePrice/AverageHousePriceSolution.py   | 20 ++++++-----
 pairRdd/filter/AirportsNotInUsaSolution.py    |  8 +++--
 .../groupbykey/AirportsByCountrySolution.py   | 10 +++---
 .../mapValues/AirportsUppercaseSolution.py    |  9 ++---
 pairRdd/sort/AverageHousePriceSolution.py     | 10 +++---
 rdd/airports/AirportsByLatitudeSolution.py    |  7 ++--
 rdd/airports/AirportsInUsaSolution.py         |  7 ++--
 rdd/count/CountExample.py                     |  3 ++
 sparkSql/HousePriceProblem.py                 | 13 +++----
 sparkSql/HousePriceSolution.py                |  4 +--
 sparkSql/RddDataframeConversion.py            | 25 ++++++-------
 sparkSql/StackOverFlowSurvey.py               | 36 ++++++++++++-------
 sparkSql/join/UkMakerSpaces.py                | 17 ++++-----
 18 files changed, 148 insertions(+), 124 deletions(-)

diff --git a/advanced/accumulator/StackOverFlowSurvey.py b/advanced/accumulator/StackOverFlowSurvey.py
index be9b0b3a..6a54b44b 100644
--- a/advanced/accumulator/StackOverFlowSurvey.py
+++ b/advanced/accumulator/StackOverFlowSurvey.py
@@ -1,25 +1,24 @@
-from pyspark import SparkContext
+import sys
+sys.path.insert(0, '.')
+from pyspark import SparkContext, SparkConf
 from commons.Utils import Utils

-def filterResponseFromCanada(response, total, missingSalaryMidPoint):
-    splits = Utils.COMMA_DELIMITER.split(response)
-    total.add(1)
-    if not splits[14]:
-        missingSalaryMidPoint.add(1)
-    return splits[2] == "Canada"
-
 if __name__ == "__main__":
-    sc = SparkContext("local", "StackOverFlowSurvey")
-    sc.setLogLevel("ERROR")
-
+    conf = SparkConf().setAppName('StackOverFlowSurvey').setMaster("local[*]")
+    sc = SparkContext(conf = conf)
     total = sc.accumulator(0)
     missingSalaryMidPoint = sc.accumulator(0)
-
     responseRDD = sc.textFile("in/2016-stack-overflow-survey-responses.csv")

-    responseFromCanada = responseRDD.filter(lambda response: \
-        filterResponseFromCanada(response, total, missingSalaryMidPoint))
+    def filterResponseFromCanada(response):
+        splits = Utils.COMMA_DELIMITER.split(response)
+        total.add(1)
+        if not splits[14]:
+            missingSalaryMidPoint.add(1)
+        return splits[2] == "Canada"

+    responseFromCanada = responseRDD.filter(filterResponseFromCanada)
     print("Count of responses from Canada: {}".format(responseFromCanada.count()))
     print("Total count of responses: {}".format(total.value))
-    print("Count of responses missing salary middle point: {}".format(missingSalaryMidPoint.value))
+    print("Count of responses missing salary middle point: {}" \
+        .format(missingSalaryMidPoint.value))
diff --git a/advanced/accumulator/StackOverFlowSurveyFollowUp.py b/advanced/accumulator/StackOverFlowSurveyFollowUp.py
index 8db80d1f..03f3909a 100644
--- a/advanced/accumulator/StackOverFlowSurveyFollowUp.py
+++ b/advanced/accumulator/StackOverFlowSurveyFollowUp.py
@@ -1,26 +1,25 @@
-from pyspark import SparkContext
+import sys
+sys.path.insert(0, '.')
+from pyspark import SparkContext, SparkConf
 from commons.Utils import Utils

-def filterResponseFromCanada(response, total, missingSalaryMidPoint, processedBytes):
-    processedBytes.add(len(response.encode('utf-8')))
-    splits = Utils.COMMA_DELIMITER.split(response)
-    total.add(1)
-    if not splits[14]:
-        missingSalaryMidPoint.add(1)
-    return splits[2] == "Canada"
-
 if __name__ == "__main__":
-    sc = SparkContext("local", "StackOverFlowSurvey")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName('StackOverFlowSurvey').setMaster("local[*]")
+    sc = SparkContext(conf = conf)

     total = sc.accumulator(0)
     missingSalaryMidPoint = sc.accumulator(0)
     processedBytes = sc.accumulator(0)
-
     responseRDD = sc.textFile("in/2016-stack-overflow-survey-responses.csv")

-    responseFromCanada = responseRDD.filter(lambda response: \
-        filterResponseFromCanada(response, total, missingSalaryMidPoint, processedBytes))
+    def filterResponseFromCanada(response):
+        processedBytes.add(len(response.encode('utf-8')))
+        splits = Utils.COMMA_DELIMITER.split(response)
+        total.add(1)
+        if not splits[14]:
+            missingSalaryMidPoint.add(1)
+        return splits[2] == "Canada"

+    responseFromCanada = responseRDD.filter(filterResponseFromCanada)
     print("Count of responses from Canada: {}".format(responseFromCanada.count()))
     print("Number of bytes processed: {}".format(processedBytes.value))
diff --git a/advanced/broadcast/UkMakerSpaces.py b/advanced/broadcast/UkMakerSpaces.py
index 7cc172d0..9323590e 100644
--- a/advanced/broadcast/UkMakerSpaces.py
+++ b/advanced/broadcast/UkMakerSpaces.py
@@ -1,19 +1,21 @@
-from pyspark import SparkContext
+import sys
+sys.path.insert(0, '.')
+from pyspark import SparkContext, SparkConf
 from commons.Utils import Utils

-def getPostPrefix(line: str):
-    splits = Utils.COMMA_DELIMITER.split(line)
-    postcode = splits[4]
-    return None if not postcode else postcode.split(" ")[0]
-
 def loadPostCodeMap():
     lines = open("in/uk-postcode.csv", "r").read().split("\n")
     splitsForLines = [Utils.COMMA_DELIMITER.split(line) for line in lines if line != ""]
     return {splits[0]: splits[7] for splits in splitsForLines}

+def getPostPrefix(line: str):
+    splits = Utils.COMMA_DELIMITER.split(line)
+    postcode = splits[4]
+    return None if not postcode else postcode.split(" ")[0]
+
 if __name__ == "__main__":
-    sc = SparkContext("local", "UkMakerSpaces")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName('UkMakerSpaces').setMaster("local[*]")
+    sc = SparkContext(conf = conf)

     postCodeMap = sc.broadcast(loadPostCodeMap())
diff --git a/advanced/broadcast/UkMakerSpacesWithoutBroadcast.py b/advanced/broadcast/UkMakerSpacesWithoutBroadcast.py
index 4854f417..17e79d36 100644
--- a/advanced/broadcast/UkMakerSpacesWithoutBroadcast.py
+++ b/advanced/broadcast/UkMakerSpacesWithoutBroadcast.py
@@ -1,26 +1,28 @@
-from pyspark import SparkContext
+import sys
+sys.path.insert(0, '.')
+from pyspark import SparkContext, SparkConf
 from commons.Utils import Utils

-def getPostPrefixes(line: str):
-    postcode = Utils.COMMA_DELIMITER.split(line)[4]
-    cleanedPostCode = postcode.replace("\\s+", "")
-    return [cleanedPostCode[0:i] for i in range(0,len(cleanedPostCode)+1)]
-
 def loadPostCodeMap():
     lines = open("in/uk-postcode.csv", "r").read().split("\n")
     splitsForLines = [Utils.COMMA_DELIMITER.split(line) for line in lines if line != ""]
     return {splits[0]: splits[7] for splits in splitsForLines}

+def getPostPrefix(line: str):
+    splits = Utils.COMMA_DELIMITER.split(line)
+    postcode = splits[4]
+    return None if not postcode else postcode.split(" ")[0]
+
 if __name__ == "__main__":
-    sc = SparkContext("local", "UkMakerSpaces")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName('UkMakerSpaces').setMaster("local[*]")
+    sc = SparkContext(conf = conf)

     postCodeMap = loadPostCodeMap()
     makerSpaceRdd = sc.textFile("in/uk-makerspaces-identifiable-data.csv")

     regions = makerSpaceRdd \
       .filter(lambda line: Utils.COMMA_DELIMITER.split(line)[0] != "Timestamp") \
-      .map(lambda line: next((postCodeMap[prefix] for prefix in getPostPrefixes(line) \
-        if prefix in postCodeMap), "Unknow"))
+      .map(lambda line: postCodeMap[getPostPrefix(line)] \
+        if getPostPrefix(line) in postCodeMap else "Unknow")

     for region, count in regions.countByValue().items():
         print("{} : {}".format(region, count))
diff --git a/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py b/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py
index 4885ccbe..00b82eba 100644
--- a/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py
+++ b/pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py
@@ -1,9 +1,8 @@
-from pyspark import SparkContext
+from pyspark import SparkContext, SparkConf

 if __name__ == "__main__":
-
-    sc = SparkContext("local", "AverageHousePrice")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("AverageHousePrice").setMaster("local")
+    sc = SparkContext(conf = conf)

     lines = sc.textFile("in/RealEstate.csv")
     cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)
diff --git a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.py b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.py
index acb633e2..69ecc9ae 100644
--- a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.py
+++ b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.py
@@ -1,24 +1,26 @@
-from pyspark import SparkContext
+import sys
+sys.path.insert(0, '.')
+from pyspark import SparkContext, SparkConf
+from pairRdd.aggregation.reducebykey.housePrice.AvgCount import AvgCount

 if __name__ == "__main__":
-
-    sc = SparkContext("local", "avgHousePrice")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("avgHousePrice").setMaster("local[3]")
+    sc = SparkContext(conf = conf)

     lines = sc.textFile("in/RealEstate.csv")
     cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)

     housePricePairRdd = cleanedLines.map(lambda line: \
-        (line.split(",")[3], (1, float(line.split(",")[2]))))
+        (line.split(",")[3], AvgCount(1, float(line.split(",")[2]))))

     housePriceTotal = housePricePairRdd \
-        .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
+        .reduceByKey(lambda x, y: AvgCount(x.count + y.count, x.total + y.total))

     print("housePriceTotal: ")
-    for bedroom, total in housePriceTotal.collect():
-        print("{} : {}".format(bedroom, total))
+    for bedroom, avgCount in housePriceTotal.collect():
+        print("{} : ({}, {})".format(bedroom, avgCount.count, avgCount.total))

-    housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount[1] / avgCount[0])
+    housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount.total / avgCount.count)

     print("\nhousePriceAvg: ")
     for bedroom, avg in housePriceAvg.collect():
         print("{} : {}".format(bedroom, avg))
diff --git a/pairRdd/filter/AirportsNotInUsaSolution.py b/pairRdd/filter/AirportsNotInUsaSolution.py
index fd9c2fe9..e30b4835 100644
--- a/pairRdd/filter/AirportsNotInUsaSolution.py
+++ b/pairRdd/filter/AirportsNotInUsaSolution.py
@@ -1,10 +1,12 @@
-from pyspark import SparkContext
+import sys
+sys.path.insert(0, '.')
+from pyspark import SparkContext, SparkConf
 from commons.Utils import Utils

 if __name__ == "__main__":
-    sc = SparkContext("local", "airports")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("airports").setMaster("local[*]")
+    sc = SparkContext(conf = conf)

     airportsRDD = sc.textFile("in/airports.text")
diff --git a/pairRdd/groupbykey/AirportsByCountrySolution.py b/pairRdd/groupbykey/AirportsByCountrySolution.py
index 0cc7017e..a5670f6c 100644
--- a/pairRdd/groupbykey/AirportsByCountrySolution.py
+++ b/pairRdd/groupbykey/AirportsByCountrySolution.py
@@ -1,10 +1,12 @@
-from pyspark import SparkContext
+import sys
+sys.path.insert(0, '.')
+from pyspark import SparkContext, SparkConf
 from commons.Utils import Utils

 if __name__ == "__main__":
-    sc = SparkContext("local", "airports")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("airports").setMaster("local[*]")
+    sc = SparkContext(conf = conf)

     lines = sc.textFile("in/airports.text")

@@ -15,4 +17,4 @@
     airportsByCountry = countryAndAirportNameAndPair.groupByKey()

     for country, airportName in airportsByCountry.collectAsMap().items():
-        print("{}: {}".format(country,list(airportName)))
+        print("{}: {}".format(country, list(airportName)))
diff --git a/pairRdd/mapValues/AirportsUppercaseSolution.py b/pairRdd/mapValues/AirportsUppercaseSolution.py
index 3c9fa201..fded76e7 100644
--- a/pairRdd/mapValues/AirportsUppercaseSolution.py
+++ b/pairRdd/mapValues/AirportsUppercaseSolution.py
@@ -1,10 +1,11 @@
-from pyspark import SparkContext
+import sys
+sys.path.insert(0, '.')
+from pyspark import SparkContext, SparkConf
 from commons.Utils import Utils

 if __name__ == "__main__":
-
-    sc = SparkContext("local", "airports")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("airports").setMaster("local[*]")
+    sc = SparkContext(conf = conf)

     airportsRDD = sc.textFile("in/airports.text")
diff --git a/pairRdd/sort/AverageHousePriceSolution.py b/pairRdd/sort/AverageHousePriceSolution.py
index 4306fdbc..1fda400e 100644
--- a/pairRdd/sort/AverageHousePriceSolution.py
+++ b/pairRdd/sort/AverageHousePriceSolution.py
@@ -1,11 +1,11 @@
+import sys
+sys.path.insert(0, '.')
 from pairRdd.aggregation.reducebykey.housePrice.AvgCount import AvgCount
-from pyspark import SparkContext
-
+from pyspark import SparkContext, SparkConf

 if __name__ == "__main__":
-
-    sc = SparkContext("local", "averageHousePriceSolution")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("averageHousePriceSolution").setMaster("local[*]")
+    sc = SparkContext(conf = conf)

     lines = sc.textFile("in/RealEstate.csv")
     cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)
diff --git a/rdd/airports/AirportsByLatitudeSolution.py b/rdd/airports/AirportsByLatitudeSolution.py
index 2ce476d5..70c1db13 100644
--- a/rdd/airports/AirportsByLatitudeSolution.py
+++ b/rdd/airports/AirportsByLatitudeSolution.py
@@ -1,4 +1,6 @@
-from pyspark import SparkContext
+import sys
+sys.path.insert(0, '.')
+from pyspark import SparkContext, SparkConf
 from commons.Utils import Utils

 def splitComma(line: str):
@@ -6,7 +8,8 @@ def splitComma(line: str):
     return "{}, {}".format(splits[1], splits[6])

 if __name__ == "__main__":
-    sc = SparkContext("local", "airports")
+    conf = SparkConf().setAppName("airports").setMaster("local[*]")
+    sc = SparkContext(conf = conf)

     airports = sc.textFile("in/airports.text")
diff --git a/rdd/airports/AirportsInUsaSolution.py b/rdd/airports/AirportsInUsaSolution.py
index 96ec733b..1fa72b57 100644
--- a/rdd/airports/AirportsInUsaSolution.py
+++ b/rdd/airports/AirportsInUsaSolution.py
@@ -1,4 +1,6 @@
-from pyspark import SparkContext
+import sys
+sys.path.insert(0, '.')
+from pyspark import SparkContext, SparkConf
 from commons.Utils import Utils

 def splitComma(line: str):
@@ -6,7 +8,8 @@ def splitComma(line: str):
     return "{}, {}".format(splits[1], splits[2])

 if __name__ == "__main__":
-    sc = SparkContext("local", "airports")
+    conf = SparkConf().setAppName("airports").setMaster("local[*]")
+    sc = SparkContext(conf = conf)

     airports = sc.textFile("in/airports.text")
     airportsInUSA = airports.filter(lambda line : Utils.COMMA_DELIMITER.split(line)[3] == "\"United States\"")
diff --git a/rdd/count/CountExample.py b/rdd/count/CountExample.py
index eb53dbc3..54cd69a2 100644
--- a/rdd/count/CountExample.py
+++ b/rdd/count/CountExample.py
@@ -3,9 +3,12 @@
 if __name__ == "__main__":
     conf = SparkConf().setAppName("count").setMaster("local[*]")
     sc = SparkContext(conf = conf)
+
     inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
+
     wordRdd = sc.parallelize(inputWords)
     print("Count: {}".format(wordRdd.count()))
+
     worldCountByValue = wordRdd.countByValue()
     print("CountByValue: ")
     for word, count in worldCountByValue.items():
diff --git a/sparkSql/HousePriceProblem.py b/sparkSql/HousePriceProblem.py
index a5f88bfa..3122c3eb 100644
--- a/sparkSql/HousePriceProblem.py
+++ b/sparkSql/HousePriceProblem.py
@@ -4,20 +4,21 @@
     Create a Spark program to read the house data from in/RealEstate.csv,
     group by location, aggregate the average price per SQ Ft and sort by average price per SQ Ft.

-    The houses dataset contains a collection of recent real estate listings in San Luis Obispo county and
-    around it.
+    The houses dataset contains a collection of recent real estate listings in
+    San Luis Obispo county and around it.

     The dataset contains the following fields:
     1. MLS: Multiple listing service number for the house (unique ID).
-    2. Location: city/town where the house is located. Most locations are in San Luis Obispo county and
-    northern Santa Barbara county (Santa Maria-Orcutt, Lompoc, Guadelupe, Los Alamos), but there
-    some out of area locations as well.
+    2. Location: city/town where the house is located. Most locations are in
+    San Luis Obispo county and northern Santa Barbara county (Santa Maria-Orcutt, Lompoc,
+    Guadelupe, Los Alamos), but there some out of area locations as well.
     3. Price: the most recent listing price of the house (in dollars).
     4. Bedrooms: number of bedrooms.
     5. Bathrooms: number of bathrooms.
     6. Size: size of the house in square feet.
     7. Price/SQ.ft: price of the house per square foot.
-    8. Status: type of sale. Thee types are represented in the dataset: Short Sale, Foreclosure and Regular.
+    8. Status: type of sale. Thee types are represented in the dataset: Short Sale,
+    Foreclosure and Regular.

     Each field is comma separated.

diff --git a/sparkSql/HousePriceSolution.py b/sparkSql/HousePriceSolution.py
index d6982f4c..cf4f2e28 100644
--- a/sparkSql/HousePriceSolution.py
+++ b/sparkSql/HousePriceSolution.py
@@ -4,8 +4,8 @@

 if __name__ == "__main__":

-    session = SparkSession.builder.appName("HousePriceSolution").master("local").getOrCreate()
-    session.sparkContext.setLogLevel("ERROR")
+    session = SparkSession.builder.appName("HousePriceSolution").master("local[*]").getOrCreate()
+
     realEstate = session.read \
         .option("header","true") \
         .option("inferSchema", value=True) \
diff --git a/sparkSql/RddDataframeConversion.py b/sparkSql/RddDataframeConversion.py
index 76359eff..8b99156a 100644
--- a/sparkSql/RddDataframeConversion.py
+++ b/sparkSql/RddDataframeConversion.py
@@ -1,32 +1,33 @@
+import sys
+sys.path.insert(0, '.')
 from pyspark.sql import SparkSession
 from commons.Utils import Utils

-def getColNames(line: str):
-    splits = Utils.COMMA_DELIMITER.split(line)
-    return [splits[2], splits[6], splits[9], splits[14]]
-
 def mapResponseRdd(line: str):
     splits = Utils.COMMA_DELIMITER.split(line)
     double1 = None if not splits[6] else float(splits[6])
     double2 = None if not splits[14] else float(splits[14])
     return splits[2], double1, splits[9], double2

+def getColNames(line: str):
+    splits = Utils.COMMA_DELIMITER.split(line)
+    return [splits[2], splits[6], splits[9], splits[14]]
+
 if __name__ == "__main__":
-    session = SparkSession.builder.appName("StackOverFlowSurvey").master("local").getOrCreate()
+    session = SparkSession.builder.appName("StackOverFlowSurvey").master("local[*]").getOrCreate()
     sc = session.sparkContext
-    sc.setLogLevel("ERROR")

     lines = sc.textFile("in/2016-stack-overflow-survey-responses.csv")

-    colNames = lines \
-        .filter(lambda line: Utils.COMMA_DELIMITER.split(line)[2] == "country") \
-        .map(getColNames)
-
     responseRDD = lines \
         .filter(lambda line: not Utils.COMMA_DELIMITER.split(line)[2] == "country") \
         .map(mapResponseRdd)

+    colNames = lines \
+        .filter(lambda line: Utils.COMMA_DELIMITER.split(line)[2] == "country") \
+        .map(getColNames)
+
     responseDataFrame = responseRDD.toDF(colNames.collect()[0])

     print("=== Print out schema ===")
@@ -35,5 +36,5 @@ def mapResponseRdd(line: str):
     print("=== Print 20 records of responses table ===")
     responseDataFrame.show(20)

-    for response in responseDataFrame.rdd.collect():
-        print(response)
+    for response in responseDataFrame.rdd.take(10):
+        print(response)
\ No newline at end of file
diff --git a/sparkSql/StackOverFlowSurvey.py b/sparkSql/StackOverFlowSurvey.py
index e7e9d94a..0eec8657 100644
--- a/sparkSql/StackOverFlowSurvey.py
+++ b/sparkSql/StackOverFlowSurvey.py
@@ -6,39 +6,45 @@
 if __name__ == "__main__":

-    session = SparkSession.builder.appName("StackOverFlowSurvey").master("local").getOrCreate()
-    session.sparkContext.setLogLevel("ERROR")
+    session = SparkSession.builder.appName("StackOverFlowSurvey").getOrCreate()
+    sc = session.sparkContext
+    sc.setLogLevel('ERROR')
+
     dataFrameReader = session.read

     responses = dataFrameReader \
         .option("header", "true") \
         .option("inferSchema", value = True) \
-        .csv("in/2016-stack-overflow-survey-responses.csv")
-
+        .csv("s3n://stackoverflow-analytics-pedro/2016-stack-overflow-survey-responses.csv")
+
     print("=== Print out schema ===")
     responses.printSchema()

-    responseWithSelectedColumns = responses.select("country", "occupation", AGE_MIDPOINT, SALARY_MIDPOINT)
+    responseWithSelectedColumns = responses.select("country", "occupation",
+        AGE_MIDPOINT, SALARY_MIDPOINT)

     print("=== Print the selected columns of the table ===")
     responseWithSelectedColumns.show()

     print("=== Print records where the response is from Afghanistan ===")
-    responseWithSelectedColumns.filter(responseWithSelectedColumns["country"] == "Afghanistan").show()
+    responseWithSelectedColumns\
+        .filter(responseWithSelectedColumns["country"] == "Afghanistan").show()

     print("=== Print the count of occupations ===")
-    groupedDataset = responseWithSelectedColumns.groupBy("occupation")
-    groupedDataset.count().show()
+    groupedData = responseWithSelectedColumns.groupBy("occupation")
+    groupedData.count().show()

     print("=== Print records with average mid age less than 20 ===")
-    responseWithSelectedColumns.filter(responseWithSelectedColumns[AGE_MIDPOINT] < 20).show()
+    responseWithSelectedColumns\
+        .filter(responseWithSelectedColumns[AGE_MIDPOINT] < 20).show()

     print("=== Print the result by salary middle point in descending order ===")
-    responseWithSelectedColumns.orderBy(responseWithSelectedColumns[SALARY_MIDPOINT], ascending=False).show()
+    responseWithSelectedColumns\
+        .orderBy(responseWithSelectedColumns[SALARY_MIDPOINT], ascending = False).show()

     print("=== Group by country and aggregate by average salary middle point ===")
-    datasetGroupByCountry = responseWithSelectedColumns.groupBy("country")
-    datasetGroupByCountry.avg(SALARY_MIDPOINT).show()
+    dataGroupByCountry = responseWithSelectedColumns.groupBy("country")
+    dataGroupByCountry.avg(SALARY_MIDPOINT).show()

     responseWithSalaryBucket = responses.withColumn(SALARY_MIDPOINT_BUCKET,
         ((responses[SALARY_MIDPOINT]/20000).cast("integer")*20000))
@@ -47,6 +53,10 @@
     responseWithSalaryBucket.select(SALARY_MIDPOINT, SALARY_MIDPOINT_BUCKET).show()

     print("=== Group by salary bucket ===")
-    responseWithSalaryBucket.groupBy(SALARY_MIDPOINT_BUCKET).count().orderBy(SALARY_MIDPOINT_BUCKET).show()
+    responseWithSalaryBucket \
+        .groupBy(SALARY_MIDPOINT_BUCKET) \
+        .count() \
+        .orderBy(SALARY_MIDPOINT_BUCKET) \
+        .show()

     session.stop()
diff --git a/sparkSql/join/UkMakerSpaces.py b/sparkSql/join/UkMakerSpaces.py
index 446cbf6c..f1f8a526 100644
--- a/sparkSql/join/UkMakerSpaces.py
+++ b/sparkSql/join/UkMakerSpaces.py
@@ -1,27 +1,22 @@
 from pyspark.sql import SparkSession, functions as fs

 if __name__ == "__main__":
+    session = SparkSession.builder.appName("UkMakerSpaces").master("local[*]").getOrCreate()

-    session = SparkSession.builder.appName("UkMakerSpaces").master("local").getOrCreate()
-    sc = session.sparkContext
-    sc.setLogLevel("ERROR")
-
-    makerSpace = session.read \
-        .option("header", "true") \
+    makerSpace = session.read.option("header", "true") \
         .csv("in/uk-makerspaces-identifiable-data.csv")

-    postCode = session.read \
-        .option("header", "true") \
-        .csv("in/uk-postcode.csv") \
+    postCode = session.read.option("header", "true").csv("in/uk-postcode.csv") \
         .withColumn("PostCode", fs.concat_ws("", fs.col("PostCode"), fs.lit(" ")))

     print("=== Print 20 records of makerspace table ===")
     makerSpace.select("Name of makerspace", "Postcode").show()

     print("=== Print 20 records of postcode table ===")
-    postCode.show()
+    postCode.select("PostCode", "Region").show()

-    joined = makerSpace.join(postCode, makerSpace["Postcode"].startswith(postCode["Postcode"]), "left_outer")
+    joined = makerSpace \
+        .join(postCode, makerSpace["Postcode"].startswith(postCode["Postcode"]), "left_outer")

     print("=== Group by Region ===")
     joined.groupBy("Region").count().show(200)
\ No newline at end of file

From ac8e586009799e1aedd074dd850253e8b97033d2 Mon Sep 17 00:00:00 2001
From: Pedro Bernardo
Date: Sun, 4 Feb 2018 03:51:22 -0800
Subject: [PATCH 2/2] Reverted sparkSql/StackOverflowSurvey to local version

---
 sparkSql/StackOverFlowSurvey.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sparkSql/StackOverFlowSurvey.py b/sparkSql/StackOverFlowSurvey.py
index 0eec8657..a27c7eb9 100644
--- a/sparkSql/StackOverFlowSurvey.py
+++ b/sparkSql/StackOverFlowSurvey.py
@@ -7,15 +7,13 @@
 if __name__ == "__main__":

     session = SparkSession.builder.appName("StackOverFlowSurvey").getOrCreate()
-    sc = session.sparkContext
-    sc.setLogLevel('ERROR')
-
+
     dataFrameReader = session.read

     responses = dataFrameReader \
         .option("header", "true") \
         .option("inferSchema", value = True) \
-        .csv("s3n://stackoverflow-analytics-pedro/2016-stack-overflow-survey-responses.csv")
+        .csv("in/2016-stack-overflow-survey-responses.csv")

     print("=== Print out schema ===")
     responses.printSchema()
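Note: the reducebykey and sort solutions above import AvgCount from pairRdd/aggregation/reducebykey/housePrice/AvgCount.py, a helper that this patch does not itself touch. A minimal sketch of that class, assuming only what the usage here implies (it is constructed as AvgCount(count, total) and read back through its .count and .total attributes), might look like:

    class AvgCount():
        def __init__(self, count: int, total: float):
            # running number of records and running price total for one key,
            # combined pairwise inside reduceByKey
            self.count = count
            self.total = total

The sys.path.insert(0, '.') lines added at the top of each script assume the examples are launched from the repository root (for example, spark-submit rdd/airports/AirportsInUsaSolution.py), so that the commons and pairRdd packages and the in/ data directory resolve relative to the working directory.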
