# jexp/ms-concepts-import.sh

## ms-concepts-import.sh
#######################################
# Import the concept graph from pre-extracted, de-duplicated node files.
#
# Reads data-concept-instance-relations.txt from the current directory; per
# the header written to is_a.hdr its tab-separated columns are
# concept name, instance name, relation count.
# Globals:
#   NEO4J_HOME (read) - Neo4j installation providing bin/neo4j-import
# Arguments:
#   None
# Outputs:
#   Writes concepts.txt, instances.txt and is_a.hdr to the current directory
#   and runs neo4j-import with --into concepts.db.
# Returns:
#   Exit status of neo4j-import.
#######################################
function import_extract_first {
  local -r relations=data-concept-instance-relations.txt

  # Unique values of column 1 become the Concept nodes. LC_ALL=C makes the
  # de-duplication byte-wise, i.e. independent of the caller's locale; the
  # resulting line order is irrelevant because the file is only a node list.
  echo "name:ID(Concept)" > concepts.txt
  cut -d $'\t' -f 1 "$relations" | LC_ALL=C sort -u >> concepts.txt

  # Unique values of column 2 become the Instance nodes.
  echo "name:ID(Instance)" > instances.txt
  cut -d $'\t' -f 2 "$relations" | LC_ALL=C sort -u >> instances.txt

  # Header for the relationship import; the raw dump is reused as its data.
  printf ':END_ID(Concept)\t:START_ID(Instance)\trelations:int\n' > is_a.hdr

  # NEO4J_HOME is quoted so an installation path containing spaces still works.
  "$NEO4J_HOME/bin/neo4j-import" --into concepts.db --id-type string \
    --delimiter TAB --bad-tolerance 13000000 \
    --skip-duplicate-nodes true --skip-bad-relationships true \
    --nodes:Concept concepts.txt \
    --nodes:Instance instances.txt \
    --relationships:IS_A "is_a.hdr,$relations"
}

#######################################
# Import the concept graph straight from the raw relations dump.
#
# Instead of extracting node files first, the same dump
# (data-concept-instance-relations.txt) is passed three times, each time
# with a different header: :IGNORE marks the columns a given pass does not
# use, and --skip-duplicate-nodes is passed so repeated names are accepted.
# Globals:
#   NEO4J_HOME (read) - Neo4j installation providing bin/neo4j-import
# Arguments:
#   None
# Outputs:
#   Writes concept.hdr, instance.hdr and is_a.hdr to the current directory
#   and runs neo4j-import with --into concepts.db.
# Returns:
#   Exit status of neo4j-import.
#######################################
function import_skip_duplicates {
  local -r relations=data-concept-instance-relations.txt

  # One header per pass over the dump; columns are tab-separated to match
  # --delimiter TAB. Explicit \t avoids invisible literal tabs in the source.
  printf 'name:ID(Concept)\t:IGNORE\t:IGNORE\n' > concept.hdr
  printf ':IGNORE\tname:ID(Instance)\t:IGNORE\n' > instance.hdr
  printf ':END_ID(Concept)\t:START_ID(Instance)\trelations:int\n' > is_a.hdr

  # NEO4J_HOME is quoted so an installation path containing spaces still works.
  "$NEO4J_HOME/bin/neo4j-import" --into concepts.db --id-type string \
    --delimiter TAB --bad-tolerance 13000000 \
    --skip-duplicate-nodes true --skip-bad-relationships true \
    --nodes:Concept "concept.hdr,$relations" \
    --nodes:Instance "instance.hdr,$relations" \
    --relationships:IS_A "is_a.hdr,$relations"
}

# Abort early when the relations dump has not been downloaded into the
# current directory. The message is quoted: an unquoted '&' would put `echo`
# in the background and then try to run `Unzip` as a command.
if [[ ! -f data-concept-instance-relations.txt ]]; then
  echo "Download & Unzip from here: https://concept.research.microsoft.com/Home/Download" >&2
  exit 1
#  curl -IL -o data-concept.zip 'https://concept.research.microsoft.com/Home/DownloadData?key=S54IIWamuQc9YD1WQA6tzKULMMZ79xK6&h=2127477893'
#  unzip -j data-concept.zip
fi

# Use the caller's NEO4J_HOME when it is set; otherwise fall back to the
# path below.
export NEO4J_HOME="${NEO4J_HOME-/data/versions/neo4j-enterprise-3.0.7}"

# Run the import variant that extracts de-duplicated node files first.
import_extract_first

# Feed two uniqueness constraints on the name property to neo4j-shell, which
# is pointed at the store directory written by the import. The command path
# is quoted so a NEO4J_HOME containing spaces still resolves.
echo $'
CREATE CONSTRAINT ON (i:Instance) ASSERT i.name IS  UNIQUE;\n
CREATE CONSTRAINT ON (c:Concept)  ASSERT c.name IS  UNIQUE;' | "$NEO4J_HOME/bin/neo4j-shell" -path concepts.db

# Reference notes kept in a here-document: a pasted import summary followed by
# example Cypher queries and the output recorded for them.
# NOTE(review): `echo` does not read standard input, so none of the text below
# is printed -- this statement only emits an empty line. If the notes were
# meant to be displayed, `cat` would be needed; confirm the intent before
# changing it.
echo << EOF

IMPORT DONE in 1m 27s 888ms.
Imported:
  17878053 nodes
  33377320 relationships
  51255373 properties
Peak memory usage: 410.36 MB

cypher runtime=compiled profile
MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept)
RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10;

MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept)
> RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10;
+----------------------------------------+
| i.name  | r.relations | c.name         |
+----------------------------------------+
| "apple" | 6315        | "fruit"        |
| "apple" | 4353        | "company"      |
| "apple" | 1152        | "food"         |
| "apple" | 764         | "brand"        |
| "apple" | 750         | "fresh fruit"  |
| "apple" | 568         | "fruit tree"   |
| "apple" | 483         | "crop"         |
| "apple" | 280         | "corporation"  |
| "apple" | 279         | "manufacturer" |
| "apple" | 257         | "firm"         |
+----------------------------------------+
10 rows
20 ms

explain
MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
MATCH (c)<-[:IS_A]-(o) WHERE o <> a and o <> b
WITH o, count(*) as freq order by freq desc limit 10
RETURN o.name, freq;

export a_name="apple"
export b_name="pie"
export b_name="ipad"
MATCH (a:Instance {name:{a_name}})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:{b_name}})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
MATCH (c)<-[:IS_A]-(o)
WITH o, count(*) as freq order by freq desc SKIP 2 limit 10
RETURN o.name, freq;


export a_name="apple"
export b_name="pie"
# export b_name="ipad"
cypher runtime=compiled profile
MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10;


+--------------------------+
| c.name            | freq |
+--------------------------+
| "device"          | 1139 |
| "mobile device"   | 998  |
| "brand"           | 772  |
| "item"            | 396  |
| "product"         | 320  |
| "player"          | 201  |
| "technology"      | 191  |
| "apple product"   | 182  |
| "client"          | 147  |
| "portable device" | 139  |
+--------------------------+
10 rows
16 ms


export a_name="apple"
export b_name="pie"
# export b_name="ipad"
MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10;


+----------------------+
| c.name        | freq |
+----------------------+
| "fruit"       | 6316 |
| "food"        | 1408 |
| "item"        | 345  |
| "product"     | 268  |
| "dessert"     | 259  |
| "flavor"      | 221  |
| "baked goods" | 209  |
| "ingredient"  | 184  |
| "business"    | 144  |
| "snack"       | 144  |
+----------------------+
10 rows
15 ms


MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
RETURN count(*);

explain
with "the apple engineer is eating the apple" as sentence
unwind split(sentence," ") as word
match (i:Instance {name:word})
with collect(i) as instances
unwind range(0,length(instances)-2) as idx
with idx, instances[idx] as a, instances[idx+1] as b
MATCH (a)-[:IS_A]->(c:Concept)<-[:IS_A]-(b)
MATCH (c)<-[:IS_A]-(o) WHERE o <> a AND o <> b
WITH idx, a, b, o, count(*) as freq order by freq desc
RETURN idx, a.name, b.name, collect(o.name)[0..5] as meanings order by idx;


+-----------------------------------------------------------------------------------------------------------------------+
| idx | a.name     | b.name     | meanings                                                                              |
+-----------------------------------------------------------------------------------------------------------------------+
| 0   | "the"      | "apple"    | ["vehicle","light","money","tobacco","television"]                                    |
| 1   | "apple"    | "engineer" | ["bank","university","doctor","school","google"]                                      |
| 2   | "engineer" | "is"       | ["coos","fructose","armour","starseeds","centaur"]                                    |
| 3   | "is"       | "eating"   | ["compensation","ukuyigxoba","next","offside","process learning"]                     |
| 4   | "eating"   | "the"      | ["effective honest","crushed leg","drogue parachutes","marine sergeant","nano sized"] |
| 5   | "the"      | "apple"    | ["family","sandwich","vehicle","poison","door"]                                       |
+-----------------------------------------------------------------------------------------------------------------------+

with "the apple engineer is eating the apple" as sentence
unwind split(sentence," ") as word
match (i:Instance {name:word})
WHERE NOT (i)-[:IS_A]->(:Concept {name:"stop words"})
with collect(i) as instances
unwind range(0,length(instances)-2) as idx
with idx, instances[idx] as a, instances[idx+1] as b
MATCH (a)-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b)
WITH idx, a, b, c, r1.relations+r2.relations as freq order by freq desc
RETURN idx, a.name, b.name, collect({concept:c.name, relations:freq})[0..5] as concepts order by idx;


+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| idx | a.name     | b.name     | concepts                                                                                                                                                                                                     |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 0   | "apple"    | "engineer" | [{concept -> "company", relations -> 4363},{concept -> "firm", relations -> 260},{concept -> "product", relations -> 182},{concept -> "person", relations -> 172},{concept -> "vendor", relations -> 169}]   |
| 1   | "engineer" | "is"       | [{concept -> "word", relations -> 25}]                                                                                                                                                                       |
| 2   | "is"       | "eating"   | [{concept -> "word", relations -> 4}]                                                                                                                                                                        |
| 3   | "eating"   | "apple"    | [{concept -> "fruit", relations -> 6316},{concept -> "activity", relations -> 736},{concept -> "item", relations -> 253},{concept -> "product", relations -> 182},{concept -> "business", relations -> 144}] |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


with "the apple engineer is eating the apple" as sentence
unwind split(sentence," ") as word
match (i:Instance {name:word})
MATCH (i)-[r1:IS_A]->(c:Concept {name:"stop words"})
return i,c limit 10;

EOF
	#######################################
	# Import the concept graph from pre-extracted, de-duplicated node files.
	#
	# Reads data-concept-instance-relations.txt from the current directory; per
	# the header written to is_a.hdr its tab-separated columns are
	# concept name, instance name, relation count.
	# Globals:
	#   NEO4J_HOME (read) - Neo4j installation providing bin/neo4j-import
	# Arguments:
	#   None
	# Outputs:
	#   Writes concepts.txt, instances.txt and is_a.hdr to the current directory
	#   and runs neo4j-import with --into concepts.db.
	# Returns:
	#   Exit status of neo4j-import.
	#######################################
	function import_extract_first {
	  local -r relations=data-concept-instance-relations.txt

	  # Real pipelines: an escaped '\|' would hand a literal '|' (and the
	  # following words) to the first command as arguments instead of piping.
	  # LC_ALL=C makes the de-duplication byte-wise and locale-independent.
	  echo "name:ID(Concept)" > concepts.txt
	  cut -d $'\t' -f 1 "$relations" | LC_ALL=C sort -u >> concepts.txt

	  echo "name:ID(Instance)" > instances.txt
	  cut -d $'\t' -f 2 "$relations" | LC_ALL=C sort -u >> instances.txt

	  # All three header columns are tab-separated to match --delimiter TAB.
	  printf ':END_ID(Concept)\t:START_ID(Instance)\trelations:int\n' > is_a.hdr

	  # NEO4J_HOME is quoted so an installation path containing spaces still works.
	  "$NEO4J_HOME/bin/neo4j-import" --into concepts.db --id-type string \
	    --delimiter TAB --bad-tolerance 13000000 \
	    --skip-duplicate-nodes true --skip-bad-relationships true \
	    --nodes:Concept concepts.txt \
	    --nodes:Instance instances.txt \
	    --relationships:IS_A "is_a.hdr,$relations"
	}

	#######################################
	# Import the concept graph straight from the raw relations dump.
	#
	# The same dump (data-concept-instance-relations.txt) is passed three
	# times, each time with a different header: :IGNORE marks the columns a
	# given pass does not use, and --skip-duplicate-nodes is passed so
	# repeated names are accepted.
	# Globals:
	#   NEO4J_HOME (read) - Neo4j installation providing bin/neo4j-import
	# Arguments:
	#   None
	# Outputs:
	#   Writes concept.hdr, instance.hdr and is_a.hdr to the current directory
	#   and runs neo4j-import with --into concepts.db.
	# Returns:
	#   Exit status of neo4j-import.
	#######################################
	function import_skip_duplicates {
	  local -r relations=data-concept-instance-relations.txt

	  # Every header column is separated by a tab (written as \t) so the
	  # headers line up with --delimiter TAB; a plain space would fuse columns.
	  printf 'name:ID(Concept)\t:IGNORE\t:IGNORE\n' > concept.hdr
	  printf ':IGNORE\tname:ID(Instance)\t:IGNORE\n' > instance.hdr
	  printf ':END_ID(Concept)\t:START_ID(Instance)\trelations:int\n' > is_a.hdr

	  # NEO4J_HOME is quoted so an installation path containing spaces still works.
	  "$NEO4J_HOME/bin/neo4j-import" --into concepts.db --id-type string \
	    --delimiter TAB --bad-tolerance 13000000 \
	    --skip-duplicate-nodes true --skip-bad-relationships true \
	    --nodes:Concept "concept.hdr,$relations" \
	    --nodes:Instance "instance.hdr,$relations" \
	    --relationships:IS_A "is_a.hdr,$relations"
	}

	# Abort early when the relations dump has not been downloaded into the
	# current directory. The message is quoted: an unquoted '&' would put `echo`
	# in the background and then try to run `Unzip` as a command.
	if [[ ! -f data-concept-instance-relations.txt ]]; then
	  echo "Download & Unzip from here: https://concept.research.microsoft.com/Home/Download" >&2
	  exit 1
	# curl -IL -o data-concept.zip 'https://concept.research.microsoft.com/Home/DownloadData?key=S54IIWamuQc9YD1WQA6tzKULMMZ79xK6&h=2127477893'
	# unzip -j data-concept.zip
	fi

	# Use the caller's NEO4J_HOME when it is set; otherwise fall back to the
	# path below.
	export NEO4J_HOME="${NEO4J_HOME-/data/versions/neo4j-enterprise-3.0.7}"

	# Run the import variant that extracts de-duplicated node files first.
	import_extract_first

	# Pipe two uniqueness constraints on the name property into neo4j-shell.
	# The pipe must be a real '|': with '\|' the statements were only echoed
	# and neo4j-shell was never started. The command path is quoted so a
	# NEO4J_HOME containing spaces still resolves.
	echo $'
	CREATE CONSTRAINT ON (i:Instance) ASSERT i.name IS UNIQUE;\n
	CREATE CONSTRAINT ON (c:Concept) ASSERT c.name IS UNIQUE;' | "$NEO4J_HOME/bin/neo4j-shell" -path concepts.db

	echo << EOF

	IMPORT DONE in 1m 27s 888ms.
	Imported:
	17878053 nodes
	33377320 relationships
	51255373 properties
	Peak memory usage: 410.36 MB

	cypher runtime=compiled profile
	MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept)
	RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10;

	MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept)
	> RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10;
	+----------------------------------------+
	\| i.name \| r.relations \| c.name \|
	+----------------------------------------+
	\| "apple" \| 6315 \| "fruit" \|
	\| "apple" \| 4353 \| "company" \|
	\| "apple" \| 1152 \| "food" \|
	\| "apple" \| 764 \| "brand" \|
	\| "apple" \| 750 \| "fresh fruit" \|
	\| "apple" \| 568 \| "fruit tree" \|
	\| "apple" \| 483 \| "crop" \|
	\| "apple" \| 280 \| "corporation" \|
	\| "apple" \| 279 \| "manufacturer" \|
	\| "apple" \| 257 \| "firm" \|
	+----------------------------------------+
	10 rows
	20 ms

	explain
	MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"})
	USING INDEX a:Instance(name)
	USING INDEX b:Instance(name)
	MATCH (c)<-[:IS_A]-(o) WHERE o <> a and o <> b
	WITH o, count(*) as freq order by freq desc limit 10
	RETURN o.name, freq;

	export a_name="apple"
	export b_name="pie"
	export b_name="ipad"
	MATCH (a:Instance {name:{a_name}})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:{b_name}})
	USING INDEX a:Instance(name)
	USING INDEX b:Instance(name)
	MATCH (c)<-[:IS_A]-(o)
	WITH o, count(*) as freq order by freq desc SKIP 2 limit 10
	RETURN o.name, freq;


	export a_name="apple"
	export b_name="pie"
	# export b_name="ipad"
	cypher runtime=compiled profile
	MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}})
	USING INDEX a:Instance(name)
	USING INDEX b:Instance(name)
	RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10;


	+--------------------------+
	\| c.name \| freq \|
	+--------------------------+
	\| "device" \| 1139 \|
	\| "mobile device" \| 998 \|
	\| "brand" \| 772 \|
	\| "item" \| 396 \|
	\| "product" \| 320 \|
	\| "player" \| 201 \|
	\| "technology" \| 191 \|
	\| "apple product" \| 182 \|
	\| "client" \| 147 \|
	\| "portable device" \| 139 \|
	+--------------------------+
	10 rows
	16 ms



	export a_name="apple"
	export b_name="pie"
	# export b_name="ipad"
	MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}})
	USING INDEX a:Instance(name)
	USING INDEX b:Instance(name)
	RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10;


	+----------------------+
	\| c.name \| freq \|
	+----------------------+
	\| "fruit" \| 6316 \|
	\| "food" \| 1408 \|
	\| "item" \| 345 \|
	\| "product" \| 268 \|
	\| "dessert" \| 259 \|
	\| "flavor" \| 221 \|
	\| "baked goods" \| 209 \|
	\| "ingredient" \| 184 \|
	\| "business" \| 144 \|
	\| "snack" \| 144 \|
	+----------------------+
	10 rows
	15 ms


	MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"})
	USING INDEX a:Instance(name)
	USING INDEX b:Instance(name)
	RETURN count(*);

	explain
	with "the apple engineer is eating the apple" as sentence
	unwind split(sentence," ") as word
	match (i:Instance {name:word})
	with collect(i) as instances
	unwind range(0,length(instances)-2) as idx
	with idx, instances[idx] as a, instances[idx+1] as b
	MATCH (a)-[:IS_A]->(c:Concept)<-[:IS_A]-(b)
	MATCH (c)<-[:IS_A]-(o) WHERE o <> a AND o <> b
	WITH idx, a, b, o, count(*) as freq order by freq desc
	RETURN idx, a.name, b.name, collect(o.name)[0..5] as meanings order by idx;


	+-----------------------------------------------------------------------------------------------------------------------+
	\| idx \| a.name \| b.name \| meanings \|
	+-----------------------------------------------------------------------------------------------------------------------+
	\| 0 \| "the" \| "apple" \| ["vehicle","light","money","tobacco","television"] \|
	\| 1 \| "apple" \| "engineer" \| ["bank","university","doctor","school","google"] \|
	\| 2 \| "engineer" \| "is" \| ["coos","fructose","armour","starseeds","centaur"] \|
	\| 3 \| "is" \| "eating" \| ["compensation","ukuyigxoba","next","offside","process learning"] \|
	\| 4 \| "eating" \| "the" \| ["effective honest","crushed leg","drogue parachutes","marine sergeant","nano sized"] \|
	\| 5 \| "the" \| "apple" \| ["family","sandwich","vehicle","poison","door"] \|
	+-----------------------------------------------------------------------------------------------------------------------+

	with "the apple engineer is eating the apple" as sentence
	unwind split(sentence," ") as word
	match (i:Instance {name:word})
	WHERE NOT (i)-[:IS_A]->(:Concept {name:"stop words"})
	with collect(i) as instances
	unwind range(0,length(instances)-2) as idx
	with idx, instances[idx] as a, instances[idx+1] as b
	MATCH (a)-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b)
	WITH idx, a, b, c, r1.relations+r2.relations as freq order by freq desc
	RETURN idx, a.name, b.name, collect({concept:c.name, relations:freq})[0..5] as concepts order by idx;


	+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
	\| idx \| a.name \| b.name \| concepts \|
	+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
	\| 0 \| "apple" \| "engineer" \| [{concept -> "company", relations -> 4363},{concept -> "firm", relations -> 260},{concept -> "product", relations -> 182},{concept -> "person", relations -> 172},{concept -> "vendor", relations -> 169}] \|
	\| 1 \| "engineer" \| "is" \| [{concept -> "word", relations -> 25}] \|
	\| 2 \| "is" \| "eating" \| [{concept -> "word", relations -> 4}] \|
	\| 3 \| "eating" \| "apple" \| [{concept -> "fruit", relations -> 6316},{concept -> "activity", relations -> 736},{concept -> "item", relations -> 253},{concept -> "product", relations -> 182},{concept -> "business", relations -> 144}] \|
	+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


	with "the apple engineer is eating the apple" as sentence
	unwind split(sentence," ") as word
	match (i:Instance {name:word})
	MATCH (i)-[r1:IS_A]->(c:Concept {name:"stop words"})
	return i,c limit 10;

	EOF