import sys
import re

from pyspark import SparkContext


def flat_map(document):
    """
    Takes in document, a (key, value) pair where document[0] is the document
    ID and document[1] is the contents of the document.

    You need to keep track of three things (the word, the document ID, and
    the word's index within the document), but you are working with
    (key, value) pairs. Is there a way to combine these three things into a
    (key, value) pair?

    HINT: Since you need indices, consider iterating over the list returned
    by re.findall() with an explicit index variable. The list returned by
    re.findall() is in the same order as the original text.
    """
    # Replace or modify this function.
    return re.findall(r"\w+", document[1])


def map(arg):
    # Replace or modify this function.
    return (arg, arg)


def reduce(arg1, arg2):
    # Replace or modify this function.
    # HINT: In Python, if you want to convert a number n to its string
    # representation, you can just write str(n). You can also concatenate
    # strings with the + operator.
    return arg1


def index(file_name, output="spark-wc-out-index"):
    sc = SparkContext("local[8]", "Index")
    file = sc.sequenceFile(file_name)

    # Same message as last exercise: feel free to modify this structure so
    # that it suits your functions better (and satisfies the requirements).
    indices = file.flatMap(flat_map) \
                  .map(map) \
                  .reduceByKey(reduce)

    indices.coalesce(1).saveAsTextFile(output)


# Do not worry about this.
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) == 2:
        index(argv[1])
    elif len(argv) == 3:
        index(argv[1], argv[2])
    else:
        print("Usage: index.py <input_file> [output_dir]")
        sys.exit(1)
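
# ---------------------------------------------------------------------------
# One possible completion, left here as a commented-out sketch rather than a
# definitive solution. The "docID:index" value format below is an assumption
# based on the hints above; adapt it to whatever output format the exercise
# actually requires.
#
# def flat_map(document):
#     doc_id, contents = document
#     # Key each occurrence by word so reduceByKey groups all locations of
#     # the same word; the value records where the occurrence was found.
#     return [(word, str(doc_id) + ":" + str(i))
#             for i, word in enumerate(re.findall(r"\w+", contents))]
#
# def map(arg):
#     # flat_map already emits (key, value) pairs, so this is the identity.
#     return arg
#
# def reduce(arg1, arg2):
#     # Concatenate the location strings accumulated for the same word.
#     return arg1 + " " + arg2
# ---------------------------------------------------------------------------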