import sys
import re

from pyspark import SparkContext


def flat_map(document):
    """
    Takes in document, a (key, value) pair where document[0] is the document
    ID and document[1] is the contents of the document.

    You need to keep track of three things (the word, the document ID, and
    the word's index within the document), but you are working with
    (key, value) pairs. Is there a way to combine these three things into a
    (key, value) pair?

    HINT: Since you need indices, consider iterating over the list returned
    by re.findall() with an explicit index variable. The list returned by
    re.findall() is in the same order as the original text.
    """
    # Replace or modify this function.
    return re.findall(r"\w+", document[1])


def map(arg):
    # Replace or modify this function.
    return (arg, arg)


def reduce(arg1, arg2):
    # Replace or modify this function.
    # HINT: In Python, if you want to convert a number n to its string
    # representation, you can just write str(n). You can also concatenate
    # strings with the + operator.
    return arg1


def index(file_name, output="spark-wc-out-index"):
    sc = SparkContext("local[8]", "Index")
    file = sc.sequenceFile(file_name)

    # Same message as last exercise: feel free to modify this structure so
    # that it suits your functions better (and satisfies the requirements).
    indices = file.flatMap(flat_map) \
                  .map(map) \
                  .reduceByKey(reduce)

    indices.coalesce(1).saveAsTextFile(output)


# Do not worry about this.
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) == 2:
        index(argv[1])
    elif len(argv) == 3:
        index(argv[1], argv[2])
    else:
        print("Usage: index.py <input_file> [output_dir]")
        sys.exit(1)
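
# ---------------------------------------------------------------------------
# One possible completion, left here as a commented-out sketch rather than a
# definitive solution. The "docID:index" value format below is an assumption
# based on the hints above; adapt it to whatever output format the exercise
# actually requires.
#
# def flat_map(document):
#     doc_id, contents = document
#     # Key each occurrence by word so reduceByKey groups all locations of
#     # the same word; the value records where the occurrence was found.
#     return [(word, str(doc_id) + ":" + str(i))
#             for i, word in enumerate(re.findall(r"\w+", contents))]
#
# def map(arg):
#     # flat_map already emits (key, value) pairs, so this is the identity.
#     return arg
#
# def reduce(arg1, arg2):
#     # Concatenate the location strings accumulated for the same word.
#     return arg1 + " " + arg2
# ---------------------------------------------------------------------------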