mapとかlambdaを駆使してみた。やりすぎかもしれない。ポイントはファイルの各行を別々にsplitしたものをマージするタイミングかもしれない。ここでは、一気に全部やってしまっている。
import itertools
import re
import string  # not used below; retained from the original import list
import sys

# Runs of non-word characters separate tokens; compiled once for all lines.
_NON_WORD = re.compile(r'\W+')


def wordcount_str(line):
    """Split one line of text into ``(word, 1)`` pairs.

    The line is split on runs of non-word characters (``\\W+``); empty
    fragments produced at the edges of the line are dropped.

    Args:
        line: A single line of text.

    Returns:
        A list of ``(word, 1)`` tuples, one per token, in order of
        appearance. Matching is case-sensitive and duplicates are kept.
    """
    return [(word, 1) for word in _NON_WORD.split(line) if word]


def wordcount_file(filename):
    """Tokenize a file line by line (file -> line -> word).

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per line that contains at least one word;
        each entry is the ``wordcount_str`` result for that line.
    """
    # The context manager closes the file even if tokenizing raises.
    with open(filename, 'r') as f:
        per_line = [wordcount_str(line) for line in f]
    # Lines without any word yield an empty list; leave those out.
    return [pairs for pairs in per_line if pairs]


def wordcount(files):
    """Take a list of file names and return a word count list.

    Args:
        files: Iterable of file paths. May be empty.

    Returns:
        A list of ``(word, count)`` tuples, one per distinct word found
        across all files. An empty list when no words were found.
    """
    # Flatten files -> lines -> pairs lazily. Unlike chained
    # reduce(list + list) calls, this copes with an empty input and does
    # not copy the accumulated list at every step.
    lines = itertools.chain.from_iterable(
        wordcount_file(name) for name in files)
    pairs = itertools.chain.from_iterable(lines)
    return multi_reduce(lambda x, y: x + y, pairs)


def multi_reduce(reduce_func, mid_data):
    """Group ``(key, value, ...)`` records by key and fold their values.

    Args:
        reduce_func: Two-argument callable combining the accumulated value
            for a key with the next value seen for that key.
        mid_data: Iterable of indexable records; element 0 is the key and
            element 1 is the value.

    Returns:
        A list of ``(key, reduced_value)`` tuples, one per distinct key.
    """
    mid = {}
    for record in mid_data:
        key, value = record[0], record[1]
        if key in mid:
            mid[key] = reduce_func(mid[key], value)
        else:
            # First occurrence of the key: its value seeds the fold.
            mid[key] = value
    # Materialize so callers get a real list on Python 3 as well.
    return list(mid.items())


def main(argv=None):
    """Print the total token count and the most frequent repeated words.

    Args:
        argv: Argument vector whose first element is the program name and
            whose remaining elements are input file paths. Defaults to
            ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv
    # Skip argv[0]: it is the script itself, not an input file.
    counts = wordcount(argv[1:])
    # sum() yields 0 for no input, where a seedless reduce() would raise.
    print('%s tokens.' % sum(count for _, count in counts))

    # Only words that occur more than once are ranked.
    repeated = [(word, count) for word, count in counts if count > 1]
    # Sort by count, then by word, ascending; the most frequent come last.
    # http://blog.modp.com/2007/11/sorting-python-dict-by-value.html
    ranked = sorted(repeated, key=lambda item: (item[1], item[0]))
    n = 10
    print('top %s: %s' % (n, ranked[-n:]))


if __name__ == '__main__':
    main()