# dimazest/produce.ini

## produce.ini
# Global section: variables defined here are referenced from the rules below
# as %{name}.
#   bin               - directory holding the project's executables
#   corpora           - command used by the dictionary, cooccurrence and
#                       similarity recipes
#   fowler_corpora_py - command invoked with `-c "<python code>"` by the
#                       context-selection recipes
#   bnc_corpus        - value passed to `--corpus`; ${PWD} is not %{...}
#                       syntax, so it is presumably left for the shell to
#                       expand when a recipe runs (TODO confirm)
[]
bin = ../../bin
corpora = %{bin}/corpora
fowler_corpora_py = %{bin}/fowler.corpora-py

bnc_corpus = bnc+ccg://${PWD}/CCG_BNC_v1

# Get the wordsim353 similarity dataset (combined.csv), normalised to lower case
[data/wordsim353.csv]
recipe =
    mkdir -p data
    # Lowercase the words and replace "troops" with its stem "troop".
    # This transformation is needed because word stems will be used to extract co-occurrences.
    # --fail: exit non-zero on an HTTP error instead of feeding the server's
    # error page into the dataset; --show-error keeps the reason visible
    # despite --silent; --location follows redirects.
    # NOTE(review): curl's failure only aborts this pipeline if the recipe
    # shell runs with pipefail - confirm produce's shell options.
    curl --fail --silent --show-error --location https://bitbucket.org/dimazest/phd-buildout/raw/tip/notebooks/downloads/wordsim353/combined.csv \
    | tr '[:upper:]' '[:lower:]' | sed -e 's/troops/troop/g' \
    > %{target}

# Get the Rubenstein and Goodenough 65 similarity dataset (EN-RG-65.txt), unmodified
[data/rg65.csv]
recipe =
    mkdir -p data
    # --fail: exit non-zero on an HTTP error instead of saving the server's
    # error page as the dataset; --show-error keeps the reason visible
    # despite --silent; --location follows redirects.
    curl --fail --silent --show-error --location https://bitbucket.org/dimazest/phd-buildout/raw/tip/notebooks/downloads/RubensteinGoodenough/EN-RG-65.txt \
    > %{target}

# Extract all the words from the wordsim353 similarity dataset into a
# one-column CSV with the header "ngram"
[out/wordsim353_targets.csv]
dep.input = data/wordsim353.csv
recipe =
    mkdir -p out
    # The header
    echo ngram > %{target}
    # Emit the first column followed by the second column, drop the column
    # names ("word 1", "word 2"), then sort and remove duplicates.
    # Everything is streamed through one pipeline, so no scratch file is
    # created in the working directory (a fixed name could collide with
    # another recipe and would be left behind on failure).
    # cut's options come before the file operand, as POSIX requires.
    { cut -d, -f 1 %{input}; cut -d, -f 2 %{input}; } \
    | grep -v -e 'word 1' -e 'word 2' | sort -u >> %{target}

# Extract all the words from the Rubenstein and Goodenough 65 similarity
# dataset into a one-column CSV with the header "ngram"
[out/rg65_targets.csv]
dep.input = data/rg65.csv
recipe =
    mkdir -p out
    # The header
    echo ngram > %{target}
    # Emit the first column followed by the second column (cut's default
    # delimiter is TAB), then sort and remove duplicates.
    # Everything is streamed through one pipeline, so no scratch file is
    # created in the working directory (a fixed name could collide with
    # another recipe and would be left behind on failure).
    # cut's options come before the file operand, as POSIX requires.
    { cut -f 1 %{input}; cut -f 2 %{input}; } | sort -u >> %{target}

# Count the word frequencies, POS tagged.
# Runs `corpora bnc dictionary` over %{bnc_corpus} with --stem and writes the
# result to the target .h5 file. %{experiment} is taken from the requested
# target name.
# NOTE(review): dep.targets is declared but the recipe never references
# %{targets}; it only makes the targets file a prerequisite - confirm that is
# intended.
[out/%{experiment}_dictionary-bnc-pos.h5]
dep.targets = out/%{experiment}_targets.csv
recipe =
    %{corpora} bnc dictionary \
    --corpus %{bnc_corpus} \
    -o %{target} \
    --stem \
    -v

# Count the word frequencies, without POS tags.
# Same command as the POS-tagged dictionary rule, with --omit-tags added.
# NOTE(review): dep.targets is declared but the recipe never references
# %{targets}; it only makes the targets file a prerequisite - confirm that is
# intended.
[out/%{experiment}_dictionary-bnc-nopos.h5]
dep.targets = out/%{experiment}_targets.csv
recipe =
    %{corpora} bnc dictionary \
    --corpus %{bnc_corpus} \
    -o %{target} \
    --stem \
    --omit-tags \
    -v

# Select the tagged context words: a slice of the POS-tagged dictionary.
# The one-liner reads the 'dictionary' key of the .h5 dependency, takes the row
# slice [c_start:c_end], keeps the 'ngram' and 'tag' columns and writes them as
# CSV without the index. c_start and c_end come from the target file name.
# NOTE(review): the slice is only meaningful if the dictionary rows are ordered
# (presumably by frequency) - confirm against the dictionary command.
[out/%{experiment}_contexts_%{params}-pos_c-all-%{c_start}-%{c_end}.csv]
dep.file = out/%{experiment}_dictionary-%{params}-pos.h5
recipe =
     %{fowler_corpora_py} -c "import pandas as pd; pd.read_hdf('%{file}', key='dictionary')[%{c_start}:%{c_end}][['ngram', 'tag']].to_csv('%{target}', index=False)"

# Select the untagged context words (without POS): a slice of the untagged
# dictionary.
# The one-liner reads the 'dictionary' key of the .h5 dependency, keeps only
# the 'ngram' column, drops duplicate rows, then takes the row slice
# [c_start:c_end] and writes it as CSV without the index. Note the order:
# duplicates are removed before slicing, so the slice counts distinct ngrams.
[out/%{experiment}_contexts_%{params}-nopos_c-all-%{c_start}-%{c_end}.csv]
dep.file = out/%{experiment}_dictionary-%{params}-nopos.h5
recipe =
     %{fowler_corpora_py} -c "import pandas as pd; pd.read_hdf('%{file}', key='dictionary')[['ngram']].drop_duplicates()[%{c_start}:%{c_end}].to_csv('%{target}', index=False)"

# Build the space.
# Runs `corpora bnc cooccurrence` over %{bnc_corpus} with --stem, passing the
# experiment's targets file as -t and the contexts file selected by %{params}
# as -c, and writes the result to the target .h5 file.
# %{params} is forwarded unchanged into the contexts file name, so it has to
# match one of the two contexts rules (e.g. bnc-pos_c-all-0-3101).
[out/%{experiment}_space_%{params}.h5]
dep.targets = out/%{experiment}_targets.csv
dep.context = out/%{experiment}_contexts_%{params}.csv
recipe =
    %{corpora} bnc cooccurrence \
    -t %{targets} \
    -c %{context} \
    --corpus %{bnc_corpus} \
    -o %{target} \
    --stem

# Run an experiment.
# Declared with type = task, so the target name is not an output file.
# Requires the dataset data/<experiment>.csv and the matching space, then runs
# `corpora similarity <experiment>` with the space as -s and the dataset as
# --<experiment>-data (e.g. --wordsim353-data).
[experiment_%{experiment}__%{params}]
type = task
dep.data = data/%{experiment}.csv
dep.space = out/%{experiment}_space_%{params}.h5
recipe =
    %{corpora} similarity %{experiment} \
    -s %{space} \
    --%{experiment}-data %{data}

# Aggregate task with no recipe of its own: it only depends on the six
# experiment tasks below. Each name has the form
#   experiment_<dataset>__bnc-<pos|nopos>_c-all-<c_start>-<c_end>
# wordsim353 is listed with context slices 0-3101 and 101-3101 for both the
# tagged and untagged variants; rg65 only with 101-3101.
[universe]
type = task
dep.a = experiment_wordsim353__bnc-pos_c-all-0-3101
dep.b = experiment_wordsim353__bnc-pos_c-all-101-3101
dep.c = experiment_wordsim353__bnc-nopos_c-all-0-3101
dep.d = experiment_wordsim353__bnc-nopos_c-all-101-3101
dep.e = experiment_rg65__bnc-pos_c-all-101-3101
dep.f = experiment_rg65__bnc-nopos_c-all-101-3101

# Delete everything the rules in this file generate: the out/ and data/
# directories, including the downloaded datasets.
[vacuum]
type = task
recipe =
    rm -rf out/ data/