soimort/Grep.hs

## Grep.hs
module Main where

import Data.Time
import Debug.Trace

import Data.Char hiding (empty)
import Data.List (findIndex)
import Data.Map hiding (findIndex, foldl)
import Text.ParserCombinators.ReadP

-- Path of the dataset file that main reads.
dataset :: FilePath
dataset = "mxm_dataset_test.txt"

-- Word that main looks up in the inverted index.
target :: String
target = "devil"

-- Build the inverted index: a map from each word index to the mxmIds of the
-- tracks whose statistics mention it.  Every (wordIndex, _) pair of a track
-- prepends that track's mxmId to the list stored under wordIndex, so the ids
-- of later tracks end up in front.
invertedIndex = foldl addTrack empty
  where
    -- Fold one track's word statistics into the index built so far.
    addTrack index (_, mxmId, stat) = foldl (addWord mxmId) index stat
    -- Record that the track mxmId contains the word wordIndex.
    addWord mxmId index (wordIndex, _) =
      insertWith (++) wordIndex [mxmId] index

-- Grep a word in the inverted index.
--
--   word     : the word to search for
--   words    : the vocabulary; a word's position, counted from 1, is its index
--   invIndex : inverted index keyed by the decimal string of that position
--
-- Returns Just the value stored for the word, or Nothing when the word is
-- missing from the vocabulary or has no entry in the index.  A word that is
-- not in the vocabulary yields Nothing instead of calling `error`, so callers
-- can handle both kinds of miss through the Maybe result.
grep word words invIndex =
  case findIndex (== word) words of
    Just i  -> Data.Map.lookup (show $ i + 1) invIndex -- starts from 1
    Nothing -> Nothing

-- MusiXmatch parser

-- Parse a single line-feed character and return it.
newline :: ReadP Char
newline = satisfy (== '\n')

-- Consume one comment line: a '#', everything up to the end of the line, and
-- the terminating newline (whose character is the parser's result).
skipComment :: ReadP Char
skipComment = char '#' >> munch (/= '\n') >> newline

-- Parse one vocabulary entry: a non-empty run of characters other than ','
-- and newline, followed by zero or more commas.  Yields the entry itself.
wordName :: ReadP String
wordName = do
  name <- munch1 (\c -> not (c == ',' || c == '\n'))
  _ <- many (char ',')
  return name

-- Parse the vocabulary line: a '%' followed by word entries up to the newline
-- that ends the line.  Yields the entries in order.
topWords :: ReadP [String]
topWords = do
  _ <- char '%'
  manyTill wordName newline

-- Parse one "index:count" pair followed by zero or more commas.  Both parts
-- are non-empty digit runs and are returned as strings, not numbers.
wordStat :: ReadP (String, String)
wordStat = do
  idx <- digits
  cnt <- char ':' >> digits
  many (char ',') >> return (idx, cnt)
  where
    digits = munch1 isDigit

-- Parse one track line: "trackId,mxmId,stat,stat,...\n".  The track id is
-- everything before the first comma, the mxm id is a digit run, and the rest
-- of the line is parsed with wordStat until the newline.
track :: ReadP (String, String, [(String, String)])
track = do
  tid <- munch1 (/= ',')
  mid <- char ',' >> munch1 isDigit
  counts <- char ',' >> manyTill wordStat newline
  return (tid, mid, counts)

-- Parse a whole dataset: any number of comment lines, the vocabulary line,
-- then track lines up to the end of input.  Yields (vocabulary, tracks).
mxm :: ReadP ([String], [(String, String, [(String, String)])])
mxm =
  many skipComment >>
  topWords >>= \vocabulary ->
  manyTill track eof >>= \entries ->
  return (vocabulary, entries)

-- Run the mxm parser over the text.  Succeeds only when the parser produces
-- exactly one result; any other outcome aborts with "file not parsed".
parseText :: String -> ([String], [(String, String, [(String, String)])])
parseText text
  | [(parsed, _)] <- outcomes = parsed
  | otherwise                 = error "file not parsed"
  where
    outcomes = readP_to_S mxm text

-- Read and parse the dataset, build the inverted index, look up the target
-- word, and print the counts together with the wall-clock time of each phase.
main :: IO ()
main = do
  startedAt <- getCurrentTime
  content <- readFile dataset
  let (vocabulary, tracks) = parseText content
  -- Printing the lengths forces the parse before the next timestamp is taken.
  putStrLn ("Words:   " ++ show (length vocabulary))
  putStrLn ("Tracks:  " ++ show (length tracks))
  parsedAt <- getCurrentTime
  putStrLn ("Time on parsing:  " ++ show (diffUTCTime parsedAt startedAt))

  let invIndex = invertedIndex tracks
  -- Printing the size forces the index before the next timestamp is taken.
  putStrLn ("Indexed: " ++ show (size invIndex))
  indexedAt <- getCurrentTime
  putStrLn ("Time on indexing: " ++ show (diffUTCTime indexedAt parsedAt))

  putStrLn $ maybe "word not found" describeHits (grep target vocabulary invIndex)
  greppedAt <- getCurrentTime
  putStrLn ("Time on grepping: " ++ show (diffUTCTime greppedAt indexedAt))
  where
    describeHits hits =
      "Number of tracks with that word: " ++ show (length hits)

## test_seq.erl
-module(test_seq).
-mode(compile).

-export([inverted_index/1, grep/3, main/1]).

%% Dataset file handed to read_mxm:from_file/1 by main/1.
-define(DATASET, "mxm_dataset_test.txt").
%% Word that main/1 looks up in the inverted index.
-define(TARGET,  "devil").

%% @doc Build the inverted index for a list of raw tracks.
%%
%% Each track is decoded with read_mxm:parse_track/1 into
%% {_, MxmId, TrackWords}; for every {WordIndex, _} in TrackWords the MxmId
%% is appended to the dict entry stored under WordIndex.
%% Returns the resulting dict (an empty dict for an empty track list).
inverted_index(Tracks) ->
    AddTrack =
        fun(Track, Index) ->
            {_TrackId, MxmId, TrackWords} = read_mxm:parse_track(Track),
            AddWord =
                fun({WordIndex, _Count}, Partial) ->
                    dict:append(WordIndex, MxmId, Partial)
                end,
            lists:foldl(AddWord, Index, TrackWords)
        end,
    lists:foldl(AddTrack, dict:new(), Tracks).

%% @doc Look up Word in the inverted index.
%%
%% The lookup key is the 1-based position of the first element of Words that
%% is exactly equal to Word, or 0 when Words does not contain it. Returns
%% whatever dict:find/2 gives for that key: {ok, Value} or error.
%%
%% The position is computed with lists:splitwith/2 instead of the obsolete
%% string:str/2, which only worked here by treating the word list as a string.
grep(Word, Words, InvIndex) ->
    {Preceding, Remaining} = lists:splitwith(fun(W) -> W =/= Word end, Words),
    Position =
        case Remaining of
            [] -> 0;
            [_ | _] -> length(Preceding) + 1
        end,
    dict:find(Position, InvIndex).

%% @doc Script entry point; the argument list is ignored.
%%
%% Compiles read_mxm, loads ?DATASET through read_mxm:from_file/1, times
%% inverted_index/1 with timer:tc/3 and prints the elapsed seconds, then
%% prints how many entries the index holds for ?TARGET, or "word not found".
main(_Args) ->
    compile:file(read_mxm),
    {Words, Tracks} = read_mxm:from_file(?DATASET),
    {Micros, InvIndex} = timer:tc(test_seq, inverted_index, [Tracks]),
    %% timer:tc/3 reports microseconds; convert to seconds for display.
    Seconds = Micros * 0.000001,
    io:format("Time on indexing: ~ps~n", [Seconds]),
    Outcome = grep(?TARGET, Words, InvIndex),
    case Outcome of
        {ok, Result} ->
            Count = length(Result),
            io:format("Number of tracks with that word: ~p~n", [Count]);
        _ ->
            io:format("word not found~n")
    end.
	module Main where

	import Data.Time
	import Debug.Trace

	import Data.Char hiding (empty)
	import Data.List (findIndex)
	import Data.Map hiding (findIndex, foldl)
	import Text.ParserCombinators.ReadP

	-- Path of the dataset file that main reads.
	dataset :: FilePath
	dataset = "mxm_dataset_test.txt"

	-- Word that main looks up in the inverted index.
	target :: String
	target = "devil"

	-- Build the inverted index: a map from each word index to the mxmIds of
	-- the tracks whose statistics mention it.  The outer fold walks the
	-- tracks, the inner fold walks one track's (wordIndex, _) pairs and
	-- prepends that track's mxmId to the list stored under wordIndex.
	-- (Continuation lines are indented again; flattened to one column the
	-- definition does not parse under the layout rule.)
	invertedIndex =
	  foldl (\ acc1 (_, mxmId, stat) ->
	           foldl (\ acc2 (wordIndex, _) ->
	                    insertWith (++) wordIndex [mxmId] acc2) acc1 stat) empty

	-- Grep a word in the inverted index.
	--
	--   word     : the word to search for
	--   words    : the vocabulary; a word's position, counted from 1, is its index
	--   invIndex : inverted index keyed by the decimal string of that position
	--
	-- Returns Just the value stored for the word, or Nothing when the word is
	-- missing from the vocabulary or has no entry in the index.  A word that
	-- is not in the vocabulary yields Nothing instead of calling `error`, so
	-- callers can handle both kinds of miss through the Maybe result.
	-- (The case alternatives are indented again so the layout rule accepts them.)
	grep word words invIndex =
	  case findIndex (== word) words of
	    Just i  -> Data.Map.lookup (show $ i + 1) invIndex -- starts from 1
	    Nothing -> Nothing

	-- MusiXmatch parser

	-- Parse a single line-feed character and return it.
	newline :: ReadP Char
	newline = satisfy (== '\n')

	-- Consume one comment line: a '#', everything up to the end of the line,
	-- and the terminating newline (whose character is the parser's result).
	-- The do-block statements are indented so the layout rule sees a block.
	skipComment = do
	  char '#'
	  munch (/= '\n')
	  newline

	-- Parse one vocabulary entry: a non-empty run of characters other than
	-- ',' and newline, followed by zero or more commas.  Yields the entry.
	-- The do-block statements are indented so the layout rule sees a block.
	wordName = do
	  x <- munch1 $ \x -> x /= ',' && x /= '\n'
	  many $ char ','
	  return x

	-- Parse the vocabulary line: a '%' followed by word entries up to the
	-- newline that ends the line.  Yields the entries in order.
	topWords :: ReadP [String]
	topWords = do
	  _ <- char '%'
	  manyTill wordName newline

	-- Parse one "index:count" pair followed by zero or more commas.  Both
	-- parts are non-empty digit runs and are returned as strings.
	-- The do-block statements are indented so the layout rule sees a block.
	wordStat = do
	  wordIndex <- munch1 isDigit
	  char ':'
	  wordCount <- munch1 isDigit
	  many $ char ','
	  return (wordIndex, wordCount)

	-- Parse one track line: "trackId,mxmId,stat,stat,...\n".  The track id is
	-- everything before the first comma, the mxm id is a digit run, and the
	-- rest of the line is parsed with wordStat until the newline.
	-- The do-block statements are indented so the layout rule sees a block.
	track = do
	  trackId <- munch1 (/= ',')
	  char ','
	  mxmId <- munch1 isDigit
	  char ','
	  stat <- manyTill wordStat newline
	  return (trackId, mxmId, stat)

	-- Parse a whole dataset: any number of comment lines, the vocabulary
	-- line, then track lines up to the end of input.
	-- Yields (vocabulary, tracks).
	-- The do-block statements are indented so the layout rule sees a block.
	mxm = do
	  many skipComment
	  words <- topWords
	  tracks <- manyTill track eof
	  return (words, tracks)

	-- Run the mxm parser over the text.  Succeeds only when the parser
	-- produces exactly one result; anything else aborts with
	-- "file not parsed".
	-- The case alternatives are indented so the layout rule accepts them.
	parseText text =
	  case readP_to_S mxm text of
	    [(v, _)] -> v
	    _        -> error "file not parsed"

	-- Read and parse the dataset, build the inverted index, look up the
	-- target word, and print the counts together with the wall-clock time of
	-- each phase.  Statements and continuation lines are indented so the
	-- do-block and the case alternatives parse under the layout rule.
	main = do
	  timeStart <- getCurrentTime
	  content <- readFile dataset
	  let (words, tracks) = parseText content
	  -- Printing the lengths forces the parse before the next timestamp.
	  putStrLn $ "Words: " ++ show (length words)
	  putStrLn $ "Tracks: " ++ show (length tracks)
	  timeFinishParsing <- getCurrentTime
	  putStrLn $ "Time on parsing: " ++
	    show (diffUTCTime timeFinishParsing timeStart)

	  let invIndex = invertedIndex tracks
	  -- Printing the size forces the index before the next timestamp.
	  putStrLn $ "Indexed: " ++ show (size invIndex)
	  timeFinishIndexing <- getCurrentTime
	  putStrLn $ "Time on indexing: " ++
	    show (diffUTCTime timeFinishIndexing timeFinishParsing)

	  case grep target words invIndex of
	    Nothing     -> putStrLn "word not found"
	    Just result -> putStrLn $ "Number of tracks with that word: " ++
	                   show (length result)
	  timeFinishGrepping <- getCurrentTime
	  putStrLn $ "Time on grepping: " ++
	    show (diffUTCTime timeFinishGrepping timeFinishIndexing)
	-module(test_seq).
	-mode(compile).

	-export([inverted_index/1, grep/3, main/1]).

	%% Dataset file handed to read_mxm:from_file/1 by main/1.
	-define(DATASET, "mxm_dataset_test.txt").
	%% Word that main/1 looks up in the inverted index.
	-define(TARGET, "devil").

	%% @doc Build the inverted index for a list of raw tracks.
	%%
	%% Each track is decoded with read_mxm:parse_track/1 into
	%% {_, MxmId, TrackWords}; for every {WordIndex, _} in TrackWords the
	%% MxmId is appended to the dict entry stored under WordIndex.
	%% Returns the resulting dict (an empty dict for an empty track list).
	inverted_index(Tracks) ->
	    AddTrack =
	        fun(Track, Index) ->
	            {_TrackId, MxmId, TrackWords} = read_mxm:parse_track(Track),
	            AddWord =
	                fun({WordIndex, _Count}, Partial) ->
	                    dict:append(WordIndex, MxmId, Partial)
	                end,
	            lists:foldl(AddWord, Index, TrackWords)
	        end,
	    lists:foldl(AddTrack, dict:new(), Tracks).

	%% @doc Look up Word in the inverted index.
	%%
	%% The lookup key is the 1-based position of the first element of Words
	%% that is exactly equal to Word, or 0 when Words does not contain it.
	%% Returns whatever dict:find/2 gives for that key: {ok, Value} or error.
	%%
	%% The position is computed with lists:splitwith/2 instead of the obsolete
	%% string:str/2, which only worked here by treating the word list as a
	%% string.
	grep(Word, Words, InvIndex) ->
	    {Preceding, Remaining} = lists:splitwith(fun(W) -> W =/= Word end, Words),
	    Position =
	        case Remaining of
	            [] -> 0;
	            [_ | _] -> length(Preceding) + 1
	        end,
	    dict:find(Position, InvIndex).

	%% @doc Script entry point; the argument list is ignored.
	%%
	%% Compiles read_mxm, loads ?DATASET through read_mxm:from_file/1, times
	%% inverted_index/1 with timer:tc/3 and prints the elapsed seconds, then
	%% prints how many entries the index holds for ?TARGET, or
	%% "word not found".
	main(_Args) ->
	    compile:file(read_mxm),
	    {Words, Tracks} = read_mxm:from_file(?DATASET),
	    {Micros, InvIndex} = timer:tc(test_seq, inverted_index, [Tracks]),
	    %% timer:tc/3 reports microseconds; convert to seconds for display.
	    Seconds = Micros * 0.000001,
	    io:format("Time on indexing: ~ps~n", [Seconds]),
	    Outcome = grep(?TARGET, Words, InvIndex),
	    case Outcome of
	        {ok, Result} ->
	            Count = length(Result),
	            io:format("Number of tracks with that word: ~p~n", [Count]);
	        _ ->
	            io:format("word not found~n")
	    end.