Last active
July 30, 2017 16:42
-
-
Save egonw/2779c0628da0b24b7a113bdc9e0c1a65 to your computer and use it in GitHub Desktop.
Notebook for ContentMine literature mining of nanomaterials
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# INSTALL STUFF | |
$ curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | sudo apt-key add - | |
$ sudo nano /etc/apt/sources.list.d/nodesource.list | |
deb https://deb.nodesource.com/node_6.x stretch main | |
# deb-src https://deb.nodesource.com/node_6.x stretch main | |
$ sudo npm install --global getpapers | |
$ sudo npm install -g ctj | |
# GET PAPERS | |
The following should find one article (as it does via the website), but finds 1.6M articles :( Clearly, something is broken...
$ getpapers -q 'JRCNM01000a OR JRCNM01001a OR JRCNM01002a OR JRCNM01003a OR JRCNM01004a OR JRCNM01005a OR JRCNM01100a OR JRCNM01101a OR JRCNM02000a OR JRCNM02001a OR JRCNM02002a OR JRCNM02003a OR JRCNM02004a OR JRCNM02004b OR JRCNM03300a OR JRCNM03301a OR JRCNM04000a OR JRCNM04001a OR JRCNM10201a OR JRCNM10404 OR JRCNM62001a OR JRCNM62002a OR JRCNM62101a' -o nanotox -x | |
This does work, and returns about 470 articles: | |
$ getpapers -q '"titanium dioxide" AND toxicology' -o nanotox -x | |
# NORMALIZING | |
$ norma --project nanotox -i fulltext.xml -o scholarly.html --transform nlm2html | |
# AMI EXTRACTION | |
The following two ami runs do not seem to give results: | |
$ ami2-species --project nanotox -i scholarly.html --sp.species --sp.type genus | |
$ ami2-species --project nanotox -i scholarly.html --sp.species --sp.type binomial | |
Counting words is not a problem: | |
$ ami2-word --project nanotox --w.words wordFrequencies --w.stopwords stopwords.txt | |
It does not find any use of JRCNM codes, confirmed with a grep search. Therefore, a search on NM-100-like codes is pursued:
$ ami2-regex --project nanotox --context 25 25 -i scholarly.html --r.regex jrccodes.xml | |
The content of jrccodes.xml is: | |
<!-- Patterns for JRC nanomaterial codes: legacy NM-xxx codes (with optional hyphen and K suffix)
     and JRCNMxxxxx codes (5 or 6 digits, with or without an a/b suffix).
     The suffix-less JRCNM pattern previously ended in a stray literal double quote. -->
<compoundRegex title="jrc">
<regex fields="jrc" weight="2.0">NM[-]?\d\d\dK</regex>
<regex fields="jrc" weight="1.0">NM[-]?\d\d\d</regex>
<regex fields="jrc" weight="2.0">JRCNM\d\d\d\d\d\d?[ab]</regex>
<regex fields="jrc" weight="1.0">JRCNM\d\d\d\d\d\d?</regex>
</compoundRegex>
$ ami2-sequence --project nanotox --filter file\(\*\*/results.xml\) -o sequencesfiles.xml | |
$ node /usr/lib/node_modules/ctj/lib/ctj.js collect -p nanotox -M -o nanotox -s -g bionomial,genus,jrc,frequencies | |
# DATA ANALYSIS | |
$ groovy createTSV.groovy | |
Copy/paste the content into the network.html to visualize it with cytoscape.js in a browser. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import groovy.json.JsonSlurper | |
// Converts nanotox/articles.json into cytoscape.js element literals, printed to
// stdout so they can be pasted into the `elements` array of network.html.
// Each printed line describes one article -> term link: the article node (red),
// the term node (blue for "jrc" matches, green for "binomial" matches) and the edge.
def inputFile = new File("nanotox/articles.json")
def inputJSON = new JsonSlurper().parseText(inputFile.text)

// Escapes backslashes and single quotes so a value can be embedded in a
// single-quoted JavaScript string literal without breaking the pasted code.
def jsEscape = { value -> value.toString().replace('\\', '\\\\').replace("'", "\\'") }

// IDs (nodes and edges) that were already printed; cytoscape.js refuses a second
// element with the same ID, so every element is emitted only the first time.
def printedIds = new HashSet<String>()

// Returns the node literal for `id`, or an empty string if that node was printed before.
def nodeLiteral = { id, color ->
  printedIds.add(id) ? "{ data: { id: '${jsEscape(id)}', faveColor: '${color}' } }, " : ""
}

// Prints the elements linking article `pmcid` to `term`; `termColor` colours the term node.
def printLink = { pmcid, term, termColor ->
  def target = term.toString()
  // Same edge ID scheme as before: "e" + absolute hash of the concatenated endpoints.
  def edgeID = "e" + Math.abs((pmcid + target).hashCode())
  if (!printedIds.add(edgeID)) return
  def articleNode = nodeLiteral(pmcid, 'red')
  def termNode = nodeLiteral(target, termColor)
  println "${articleNode}${termNode}{ data: { id: '${edgeID}', source: '${jsEscape(pmcid)}', target: '${jsEscape(target)}' } },"
}

inputJSON.keySet().each { pmcid ->
  def amiResult = inputJSON[pmcid].amiResults
  if (!amiResult) return
  if (amiResult.jrc) {
    amiResult.jrc.each { nmUse -> printLink(pmcid, nmUse.value0, 'blue') }
    // NOTE(review): species links are only emitted for articles that also have JRC
    // matches (this nesting is kept from the original) - confirm that is intended.
    if (amiResult.binomial) {
      amiResult.binomial.each { nmUse -> printLink(pmcid, nmUse.exact, 'green') }
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- Patterns for JRC nanomaterial codes: legacy NM-xxx codes (with optional hyphen and K suffix)
     and JRCNMxxxxx codes (5 or 6 digits, with or without an a/b suffix).
     The suffix-less JRCNM pattern previously ended in a stray literal double quote. -->
<compoundRegex title="jrc">
<regex fields="jrc" weight="2.0">NM[-]?\d\d\dK</regex>
<regex fields="jrc" weight="1.0">NM[-]?\d\d\d</regex>
<regex fields="jrc" weight="2.0">JRCNM\d\d\d\d\d\d?[ab]</regex>
<regex fields="jrc" weight="1.0">JRCNM\d\d\d\d\d\d?</regex>
</compoundRegex>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<html>
<head>
<!-- Page that renders the article/term network with cytoscape.js and the cola layout. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/cytoscape/3.1.4/cytoscape.js"></script>
<!-- NOTE(review): the cdn.rawgit.com service was announced as retired; confirm the two
     URLs below still resolve, or host these files elsewhere. -->
<script src="https://cdn.rawgit.com/cytoscape/cytoscape.js-cola/1.6.0/cola.js"></script>
<script src="https://cdn.rawgit.com/cytoscape/cytoscape.js-cola/1.6.0/cytoscape-cola.js"></script>
<style>
body {
  font-family: helvetica;
  font-size: 14px;
}
/* The graph container covers the whole viewport. */
#cy {
  width: 100%;
  height: 100%;
  position: absolute;
  left: 0;
  top: 0;
  z-index: 999;
}
h1 {
  opacity: 0.5;
  font-size: 1em;
}
</style>
</head>
<body>
<!-- div is not a void element: "<div/>" is parsed as an unclosed start tag, which made
     the following script a child of the graph container. Close it explicitly. -->
<div id="cy"></div>
<script> | |
// Elements pasted verbatim from the output of createTSV.groovy. Every line holds
// one article node (red), one term node (blue = JRC code, green = species) and the
// edge between them, so nodes shared by several links appear more than once.
var pastedElements = [
  { data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'Acinetobacter baumannii', faveColor: 'green' } }, { data: { id: 'e981162931', source: 'PMC3841577', target: 'Acinetobacter baumannii' } },
  { data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'E. coli', faveColor: 'green' } }, { data: { id: 'e534625587', source: 'PMC3841577', target: 'E. coli' } },
  { data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'Enterobacter cloacae', faveColor: 'green' } }, { data: { id: 'e2055312211', source: 'PMC3841577', target: 'Enterobacter cloacae' } },
  { data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'Escherichia coli', faveColor: 'green' } }, { data: { id: 'e1575606416', source: 'PMC3841577', target: 'Escherichia coli' } },
  { data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'NM-300', faveColor: 'blue' } }, { data: { id: 'e1081596686', source: 'PMC3841577', target: 'NM-300' } },
  { data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'NM300', faveColor: 'blue' } }, { data: { id: 'e866168537', source: 'PMC3841577', target: 'NM300' } },
  { data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'Pseudomonas stutzeri', faveColor: 'green' } }, { data: { id: 'e733316049', source: 'PMC3841577', target: 'Pseudomonas stutzeri' } },
  { data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'P. stutzeri', faveColor: 'green' } }, { data: { id: 'e1343958775', source: 'PMC3841577', target: 'P. stutzeri' } },
  { data: { id: 'PMC4105399', faveColor: 'red' } }, { data: { id: 'NM-100', faveColor: 'blue' } }, { data: { id: 'e113805838', source: 'PMC4105399', target: 'NM-100' } },
  { data: { id: 'PMC4105399', faveColor: 'red' } }, { data: { id: 'NM-101', faveColor: 'blue' } }, { data: { id: 'e113805839', source: 'PMC4105399', target: 'NM-101' } },
  { data: { id: 'PMC4105399', faveColor: 'red' } }, { data: { id: 'NM-102', faveColor: 'blue' } }, { data: { id: 'e113805840', source: 'PMC4105399', target: 'NM-102' } },
  { data: { id: 'PMC4105399', faveColor: 'red' } }, { data: { id: 'NM-103', faveColor: 'blue' } }, { data: { id: 'e113805841', source: 'PMC4105399', target: 'NM-103' } },
  { data: { id: 'PMC4105399', faveColor: 'red' } }, { data: { id: 'NM-104', faveColor: 'blue' } }, { data: { id: 'e113805842', source: 'PMC4105399', target: 'NM-104' } },
  { data: { id: 'PMC4105399', faveColor: 'red' } }, { data: { id: 'NM-105', faveColor: 'blue' } }, { data: { id: 'e113805843', source: 'PMC4105399', target: 'NM-105' } },
  { data: { id: 'PMC4219084', faveColor: 'red' } }, { data: { id: 'NM-220', faveColor: 'blue' } }, { data: { id: 'e1196886146', source: 'PMC4219084', target: 'NM-220' } },
  { data: { id: 'PMC4440714', faveColor: 'red' } }, { data: { id: 'NM-103', faveColor: 'blue' } }, { data: { id: 'e1249231778', source: 'PMC4440714', target: 'NM-103' } },
  { data: { id: 'PMC4440714', faveColor: 'red' } }, { data: { id: 'NM-104', faveColor: 'blue' } }, { data: { id: 'e1249231777', source: 'PMC4440714', target: 'NM-104' } },
  { data: { id: 'PMC4440714', faveColor: 'red' } }, { data: { id: 'NM-110', faveColor: 'blue' } }, { data: { id: 'e1249231750', source: 'PMC4440714', target: 'NM-110' } },
  { data: { id: 'PMC4440714', faveColor: 'red' } }, { data: { id: 'NM-111', faveColor: 'blue' } }, { data: { id: 'e1249231749', source: 'PMC4440714', target: 'NM-111' } },
  { data: { id: 'PMC4440714', faveColor: 'red' } }, { data: { id: 'NM-200', faveColor: 'blue' } }, { data: { id: 'e1249230820', source: 'PMC4440714', target: 'NM-200' } },
  { data: { id: 'PMC4440714', faveColor: 'red' } }, { data: { id: 'NM-203', faveColor: 'blue' } }, { data: { id: 'e1249230817', source: 'PMC4440714', target: 'NM-203' } },
  { data: { id: 'PMC4555363', faveColor: 'red' } }, { data: { id: 'NM-105', faveColor: 'blue' } }, { data: { id: 'e1728203983', source: 'PMC4555363', target: 'NM-105' } },
  { data: { id: 'PMC4555363', faveColor: 'red' } }, { data: { id: 'NM-211', faveColor: 'blue' } }, { data: { id: 'e1728202995', source: 'PMC4555363', target: 'NM-211' } },
  { data: { id: 'PMC4555363', faveColor: 'red' } }, { data: { id: 'NM-212', faveColor: 'blue' } }, { data: { id: 'e1728202994', source: 'PMC4555363', target: 'NM-212' } },
  { data: { id: 'PMC4555363', faveColor: 'red' } }, { data: { id: 'Rodent osteopontin', faveColor: 'green' } }, { data: { id: 'e1380551857', source: 'PMC4555363', target: 'Rodent osteopontin' } },
  { data: { id: 'PMC4603643', faveColor: 'red' } }, { data: { id: 'NM-212', faveColor: 'blue' } }, { data: { id: 'e1129320529', source: 'PMC4603643', target: 'NM-212' } },
  { data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'Daphnia magna', faveColor: 'green' } }, { data: { id: 'e1657538436', source: 'PMC4630844', target: 'Daphnia magna' } },
  { data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-103', faveColor: 'blue' } }, { data: { id: 'e68105735', source: 'PMC4630844', target: 'NM-103' } },
  { data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-105', faveColor: 'blue' } }, { data: { id: 'e68105733', source: 'PMC4630844', target: 'NM-105' } },
  { data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-110', faveColor: 'blue' } }, { data: { id: 'e68105707', source: 'PMC4630844', target: 'NM-110' } },
  { data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-200', faveColor: 'blue' } }, { data: { id: 'e68104777', source: 'PMC4630844', target: 'NM-200' } },
  { data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-203', faveColor: 'blue' } }, { data: { id: 'e68104774', source: 'PMC4630844', target: 'NM-203' } },
  { data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-220', faveColor: 'blue' } }, { data: { id: 'e68104715', source: 'PMC4630844', target: 'NM-220' } },
  { data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-300', faveColor: 'blue' } }, { data: { id: 'e68103816', source: 'PMC4630844', target: 'NM-300' } },
  { data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-400', faveColor: 'blue' } }, { data: { id: 'e68102855', source: 'PMC4630844', target: 'NM-400' } },
  { data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-401', faveColor: 'blue' } }, { data: { id: 'e68102854', source: 'PMC4630844', target: 'NM-401' } },
  { data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-402', faveColor: 'blue' } }, { data: { id: 'e68102853', source: 'PMC4630844', target: 'NM-402' } },
  { data: { id: 'PMC4792104', faveColor: 'red' } }, { data: { id: 'NM-401', faveColor: 'blue' } }, { data: { id: 'e1137848102', source: 'PMC4792104', target: 'NM-401' } },
  { data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-103', faveColor: 'blue' } }, { data: { id: 'e1397588649', source: 'PMC4833924', target: 'NM-103' } },
  { data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-105', faveColor: 'blue' } }, { data: { id: 'e1397588647', source: 'PMC4833924', target: 'NM-105' } },
  { data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-110', faveColor: 'blue' } }, { data: { id: 'e1397588621', source: 'PMC4833924', target: 'NM-110' } },
  { data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-111', faveColor: 'blue' } }, { data: { id: 'e1397588620', source: 'PMC4833924', target: 'NM-111' } },
  { data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-200', faveColor: 'blue' } }, { data: { id: 'e1397587691', source: 'PMC4833924', target: 'NM-200' } },
  { data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-203', faveColor: 'blue' } }, { data: { id: 'e1397587688', source: 'PMC4833924', target: 'NM-203' } },
  { data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-211', faveColor: 'blue' } }, { data: { id: 'e1397587659', source: 'PMC4833924', target: 'NM-211' } },
  { data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-300', faveColor: 'blue' } }, { data: { id: 'e1397586730', source: 'PMC4833924', target: 'NM-300' } },
  { data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-400', faveColor: 'blue' } }, { data: { id: 'e1397585769', source: 'PMC4833924', target: 'NM-400' } },
  { data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-401', faveColor: 'blue' } }, { data: { id: 'e1397585768', source: 'PMC4833924', target: 'NM-401' } },
  { data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-402', faveColor: 'blue' } }, { data: { id: 'e1397585767', source: 'PMC4833924', target: 'NM-402' } },
  { data: { id: 'PMC5247795', faveColor: 'red' } }, { data: { id: 'NM-105', faveColor: 'blue' } }, { data: { id: 'e1231310283', source: 'PMC5247795', target: 'NM-105' } },
  { data: { id: 'PMC5450058', faveColor: 'red' } }, { data: { id: 'NM-400', faveColor: 'blue' } }, { data: { id: 'e1640982439', source: 'PMC5450058', target: 'NM-400' } },
];

// cytoscape.js rejects (and logs an error for) every element whose ID already
// exists, so keep only the first occurrence of each ID before handing the list over.
var seenElementIds = Object.create(null);
var uniqueElements = pastedElements.filter(function (element) {
  var id = element.data.id;
  if (seenElementIds[id]) {
    return false;
  }
  seenElementIds[id] = true;
  return true;
});

// The graph instance, rendered into the #cy container. Nodes are labelled with
// their ID and filled with the colour stored in their `faveColor` data field.
var cy = cytoscape({
  container: document.getElementById('cy'),
  style: [
    {
      selector: 'node',
      style: {
        'label': 'data(id)',
        'background-color': 'data(faveColor)',
        'color': 'black'
      }
    }
  ],
  elements: uniqueElements
});
// Arrange the graph with the cola (constraint-based force-directed) layout.
cy.layout(
  {
    name: "cola",
    nodeSpacing: 5,
    // The layout option is `edgeLength`; the former `edgeLengthVal` key is not
    // a cytoscape-cola option and was therefore ignored.
    edgeLength: 45,
    animate: true,
    randomize: false,
    maxSimulationTime: 1500
  }
).run();
</script> | |
</body> | |
</html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment