CoffeeScript MapReduce

While reading through my new copy of Data Science from Scratch (which is a great introduction to machine learning), I came upon the chapter on MapReduce. CoffeeScript would do the job nicely. So, here is an annotated version. It is single machine, single core (which is not terribly useful), but it demonstrates the concepts.

# This takes as input a list of items, a mapper function and a reducer function
# 
#   The mapper function takes a single item as an input, and returns an object of key / value pairs
#   The reducer function takes a key value pair and returns an arbitrary item (often a key and value)
#
exports.map_reduce = map_reduce = (items, mapper, reducer) ->  
  collector = {}

  for item in items
    for k, v of mapper(item)
      collector[k] = [] unless collector?[k]
      collector[k].push v

  (reducer(k, v) for k, v of collector)

Word count example

# sum items in a list
sum = (list) -> list.reduce (a,b) -> a + b

# this specific mapper splits its input on white space and counts the number of occurrences of a word
wc_mapper = (item) ->  
  wc = {}
  wc[token] = (wc[token] or 0) + 1 for token in item.split /\W+/g
  wc

# this specific reducer sums the list of values and returns the key and value in a list
wc_reducer = (k, v) -> [k, sum(v)]

# test it out
a = map_reduce ['coke 2l container', 'pepsi max 355ml container', 'doctor pepper 2l'], wc_mapper, wc_reducer

console.log a  

Output

[ [ 'coke', 1 ],
  [ '2l', 2 ],
  [ 'container', 2 ],
  [ 'pepsi', 1 ],
  [ 'max', 1 ],
  [ '355ml', 1 ],
  [ 'doctor', 1 ],
  [ 'pepper', 1 ] ]

Output from one MapReduce job becomes input for another

# the identity reducer
wc_ident = (k, v) -> [k, v]

# reverse mapper (counts become keys, words become values)
rev_mapper = (item) ->  
  wc = {}
  wc[item[1]] = item[0]
  wc

# test it out
b = map_reduce a, rev_mapper, wc_ident

console.log b  

Output

[ [ '1', [ 'coke', 'pepsi', 'max', '355ml', 'doctor', 'pepper' ] ],
  [ '2', [ '2l', 'container' ] ] ]