Skip to content

Instantly share code, notes, and snippets.

@mirfan899
Forked from jexp/ms-concepts-import.sh
Created September 15, 2021 07:23
Show Gist options
  • Save mirfan899/5ded62030de070e2eb851df4a60a2d2b to your computer and use it in GitHub Desktop.
Save mirfan899/5ded62030de070e2eb851df4a60a2d2b to your computer and use it in GitHub Desktop.
Load and query the Microsoft Concept Graph in Neo4j (see https://concept.research.microsoft.com/Home/Introduction).
# Import variant 1: pre-extract the distinct concept and instance names into
# their own node files, then bulk-load the nodes and IS_A relationships.
#
# Reads:   data-concept-instance-relations.txt (TAB-separated; column 1 is
#          written out as concept names, column 2 as instance names, and the
#          relationship header labels the third column relations:int)
# Writes:  concepts.txt, instances.txt and is_a.hdr in the current directory,
#          plus the store given to neo4j-import via --into (concepts.db)
# Globals: NEO4J_HOME (read) - directory that contains bin/neo4j-import
# Returns: exit status of neo4j-import
function import_extract_first {
  local data=data-concept-instance-relations.txt

  # Node files: one header line followed by the de-duplicated names.
  echo "name:ID(Concept)" > concepts.txt
  cut -d $'\t' -f 1 "$data" | sort -u >> concepts.txt

  echo "name:ID(Instance)" > instances.txt
  cut -d $'\t' -f 2 "$data" | sort -u >> instances.txt

  # The import runs with --delimiter TAB, so every header field has to be
  # TAB-separated, including the one in front of relations:int.
  printf ':END_ID(Concept)\t:START_ID(Instance)\trelations:int\n' > is_a.hdr

  "${NEO4J_HOME}/bin/neo4j-import" --into concepts.db --id-type string \
    --delimiter TAB --bad-tolerance 13000000 \
    --skip-duplicate-nodes true --skip-bad-relationships true \
    --nodes:Concept concepts.txt \
    --nodes:Instance instances.txt \
    --relationships:IS_A "is_a.hdr,${data}"
}
# Import variant 2: feed the raw relations file to neo4j-import three times
# (once per node label, once for the relationships) and let the importer
# drop duplicate nodes via --skip-duplicate-nodes, instead of pre-extracting
# the distinct names.
#
# Reads:   data-concept-instance-relations.txt (TAB-separated, three columns
#          according to the headers written below)
# Writes:  concept.hdr, instance.hdr and is_a.hdr in the current directory,
#          plus the store given to neo4j-import via --into (concepts.db)
# Globals: NEO4J_HOME (read) - directory that contains bin/neo4j-import
# Returns: exit status of neo4j-import
function import_skip_duplicates {
  local data=data-concept-instance-relations.txt

  # Each header describes the same three TAB-separated columns; :IGNORE
  # marks the columns that are not used for that particular pass.
  printf 'name:ID(Concept)\t:IGNORE\t:IGNORE\n' > concept.hdr
  printf ':IGNORE\tname:ID(Instance)\t:IGNORE\n' > instance.hdr
  printf ':END_ID(Concept)\t:START_ID(Instance)\trelations:int\n' > is_a.hdr

  "${NEO4J_HOME}/bin/neo4j-import" --into concepts.db --id-type string \
    --delimiter TAB --bad-tolerance 13000000 \
    --skip-duplicate-nodes true --skip-bad-relationships true \
    --nodes:Concept "concept.hdr,${data}" \
    --nodes:Instance "instance.hdr,${data}" \
    --relationships:IS_A "is_a.hdr,${data}"
}
# Abort early when the Concept Graph dump is not in the current directory.
# The message is quoted so that '&' is printed literally instead of being
# parsed as the background operator, and it goes to stderr as a diagnostic.
if [ ! -f data-concept-instance-relations.txt ]; then
  echo "Download & Unzip from here: https://concept.research.microsoft.com/Home/Download" >&2
  exit 1
  # Disabled download steps, kept for reference:
  # curl -IL -o data-concept.zip 'https://concept.research.microsoft.com/Home/DownloadData?key=S54IIWamuQc9YD1WQA6tzKULMMZ79xK6&h=2127477893'
  # unzip -j data-concept.zip
fi
# Use the caller's NEO4J_HOME when it is set; otherwise fall back to the
# hard-coded install location.
export NEO4J_HOME="${NEO4J_HOME-/data/versions/neo4j-enterprise-3.0.7}"

# Build concepts.db; do not go on to create constraints if the import failed.
import_extract_first || exit 1

# Add uniqueness constraints on the name property of both labels by feeding
# the statements to neo4j-shell on stdin.
"${NEO4J_HOME}/bin/neo4j-shell" -path concepts.db <<'CYPHER'
CREATE CONSTRAINT ON (i:Instance) ASSERT i.name IS UNIQUE;
CREATE CONSTRAINT ON (c:Concept) ASSERT c.name IS UNIQUE;
CYPHER
# Print the recorded import statistics and example Cypher sessions.
# cat is used because echo does not read stdin (it would print only an empty
# line); the delimiter is quoted so the notes are emitted verbatim without
# any shell expansion.
cat <<'EOF'
IMPORT DONE in 1m 27s 888ms.
Imported:
17878053 nodes
33377320 relationships
51255373 properties
Peak memory usage: 410.36 MB
cypher runtime=compiled profile
MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept)
RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10;
MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept)
> RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10;
+----------------------------------------+
| i.name | r.relations | c.name |
+----------------------------------------+
| "apple" | 6315 | "fruit" |
| "apple" | 4353 | "company" |
| "apple" | 1152 | "food" |
| "apple" | 764 | "brand" |
| "apple" | 750 | "fresh fruit" |
| "apple" | 568 | "fruit tree" |
| "apple" | 483 | "crop" |
| "apple" | 280 | "corporation" |
| "apple" | 279 | "manufacturer" |
| "apple" | 257 | "firm" |
+----------------------------------------+
10 rows
20 ms
explain
MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
MATCH (c)<-[:IS_A]-(o) WHERE o <> a and o <> b
WITH o, count(*) as freq order by freq desc limit 10
RETURN o.name, freq;
export a_name="apple"
export b_name="pie"
export b_name="ipad"
MATCH (a:Instance {name:{a_name}})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:{b_name}})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
MATCH (c)<-[:IS_A]-(o)
WITH o, count(*) as freq order by freq desc SKIP 2 limit 10
RETURN o.name, freq;
export a_name="apple"
export b_name="pie"
# export b_name="ipad"
cypher runtime=compiled profile
MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10;
+--------------------------+
| c.name | freq |
+--------------------------+
| "device" | 1139 |
| "mobile device" | 998 |
| "brand" | 772 |
| "item" | 396 |
| "product" | 320 |
| "player" | 201 |
| "technology" | 191 |
| "apple product" | 182 |
| "client" | 147 |
| "portable device" | 139 |
+--------------------------+
10 rows
16 ms
export a_name="apple"
export b_name="pie"
# export b_name="ipad"
MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10;
+----------------------+
| c.name | freq |
+----------------------+
| "fruit" | 6316 |
| "food" | 1408 |
| "item" | 345 |
| "product" | 268 |
| "dessert" | 259 |
| "flavor" | 221 |
| "baked goods" | 209 |
| "ingredient" | 184 |
| "business" | 144 |
| "snack" | 144 |
+----------------------+
10 rows
15 ms
MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
RETURN count(*);
explain
with "the apple engineer is eating the apple" as sentence
unwind split(sentence," ") as word
match (i:Instance {name:word})
with collect(i) as instances
unwind range(0,length(instances)-2) as idx
with idx, instances[idx] as a, instances[idx+1] as b
MATCH (a)-[:IS_A]->(c:Concept)<-[:IS_A]-(b)
MATCH (c)<-[:IS_A]-(o) WHERE o <> a AND o <> b
WITH idx, a, b, o, count(*) as freq order by freq desc
RETURN idx, a.name, b.name, collect(o.name)[0..5] as meanings order by idx;
+-----------------------------------------------------------------------------------------------------------------------+
| idx | a.name | b.name | meanings |
+-----------------------------------------------------------------------------------------------------------------------+
| 0 | "the" | "apple" | ["vehicle","light","money","tobacco","television"] |
| 1 | "apple" | "engineer" | ["bank","university","doctor","school","google"] |
| 2 | "engineer" | "is" | ["coos","fructose","armour","starseeds","centaur"] |
| 3 | "is" | "eating" | ["compensation","ukuyigxoba","next","offside","process learning"] |
| 4 | "eating" | "the" | ["effective honest","crushed leg","drogue parachutes","marine sergeant","nano sized"] |
| 5 | "the" | "apple" | ["family","sandwich","vehicle","poison","door"] |
+-----------------------------------------------------------------------------------------------------------------------+
with "the apple engineer is eating the apple" as sentence
unwind split(sentence," ") as word
match (i:Instance {name:word})
WHERE NOT (i)-[:IS_A]->(:Concept {name:"stop words"})
with collect(i) as instances
unwind range(0,length(instances)-2) as idx
with idx, instances[idx] as a, instances[idx+1] as b
MATCH (a)-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b)
WITH idx, a, b, c, r1.relations+r2.relations as freq order by freq desc
RETURN idx, a.name, b.name, collect({concept:c.name, relations:freq})[0..5] as concepts order by idx;
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| idx | a.name | b.name | concepts |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 0 | "apple" | "engineer" | [{concept -> "company", relations -> 4363},{concept -> "firm", relations -> 260},{concept -> "product", relations -> 182},{concept -> "person", relations -> 172},{concept -> "vendor", relations -> 169}] |
| 1 | "engineer" | "is" | [{concept -> "word", relations -> 25}] |
| 2 | "is" | "eating" | [{concept -> "word", relations -> 4}] |
| 3 | "eating" | "apple" | [{concept -> "fruit", relations -> 6316},{concept -> "activity", relations -> 736},{concept -> "item", relations -> 253},{concept -> "product", relations -> 182},{concept -> "business", relations -> 144}] |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
with "the apple engineer is eating the apple" as sentence
unwind split(sentence," ") as word
match (i:Instance {name:word})
MATCH (i)-[r1:IS_A]->(c:Concept {name:"stop words"})
return i,c limit 10;
EOF
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment