import sys
import re

from pyspark import SparkContext


def flat_map(document):
    """Return all of the words in the value of DOCUMENT, a (key, value) pair."""
    return re.findall(r"\w+", document[1])


def map(word):
    """Create a pair where the word is the key and 1 is the value."""
    return (word, 1)


def reduce(a, b):
    """Add up the counts for a word."""
    return a + b


def wordcount(file_name, output="spark-wc-out-wordcount"):
    sc = SparkContext("local[8]", "WordCount")

    # Read in the sequence file FILE_NAME to be manipulated.
    file = sc.sequenceFile(file_name)

    # - flatMap takes a function that maps one input to 0 or more items
    # - map takes a function that maps one input to exactly one item
    # - reduceByKey takes a function, groups the dataset by key, and
    #   aggregates the values of each key
    counts = file.flatMap(flat_map) \
                 .map(map) \
                 .reduceByKey(reduce)

    # Take the dataset stored in counts and write everything out to OUTPUT.
    counts.coalesce(1).saveAsTextFile(output)


# Do not worry about this.
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) == 2:
        wordcount(argv[1])
    elif len(argv) == 3:
        wordcount(argv[1], argv[2])
    else:
        print("Usage: wordcount <sequence file> [output dir]")
        sys.exit(1)
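
# A minimal run sketch. The script and input names below are placeholders,
# and this assumes the input is a Hadoop sequence file of (key, line) pairs
# produced by some preprocessing step not shown here:
#
#   spark-submit wordcount.py my-input.seq spark-wc-out
#
# Because of the coalesce(1) above, the output directory then holds a single
# part file whose lines are the str() of each (word, count) pair, e.g.:
#
#   ('hello', 3)
#   ('world', 1)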