Skip to content

Instantly share code, notes, and snippets.

@egonw
Last active July 30, 2017 16:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save egonw/2779c0628da0b24b7a113bdc9e0c1a65 to your computer and use it in GitHub Desktop.
Save egonw/2779c0628da0b24b7a113bdc9e0c1a65 to your computer and use it in GitHub Desktop.
Notebook for ContentMine literature mining of nanomaterials
# INSTALL STUFF
$ curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | sudo apt-key add -
$ sudo nano /etc/apt/sources.list.d/nodesource.list
deb https://deb.nodesource.com/node_6.x stretch main
# deb-src https://deb.nodesource.com/node_6.x stretch main
$ sudo npm install --global getpapers
$ sudo npm install -g ctj
# GET PAPERS
The following should find one article (as it does via the website), but it finds 1.6M articles :( Clearly, something is broken...
$ getpapers -q 'JRCNM01000a OR JRCNM01001a OR JRCNM01002a OR JRCNM01003a OR JRCNM01004a OR JRCNM01005a OR JRCNM01100a OR JRCNM01101a OR JRCNM02000a OR JRCNM02001a OR JRCNM02002a OR JRCNM02003a OR JRCNM02004a OR JRCNM02004b OR JRCNM03300a OR JRCNM03301a OR JRCNM04000a OR JRCNM04001a OR JRCNM10201a OR JRCNM10404 OR JRCNM62001a OR JRCNM62002a OR JRCNM62101a' -o nanotox -x
This does work, and returns about 470 articles:
$ getpapers -q '"titanium dioxide" AND toxicology' -o nanotox -x
# NORMALIZING
$ norma --project nanotox -i fulltext.xml -o scholarly.html --transform nlm2html
# AMI EXTRACTION
The following two ami runs do not seem to give results:
$ ami2-species --project nanotox -i scholarly.html --sp.species --sp.type genus
$ ami2-species --project nanotox -i scholarly.html --sp.species --sp.type binomial
Counting words is not a problem:
$ ami2-word --project nanotox --w.words wordFrequencies --w.stopwords stopwords.txt
It does not find any use of JRCNM codes, as confirmed with a grep search. Therefore, a search for NM-100-like codes is pursued:
$ ami2-regex --project nanotox --context 25 25 -i scholarly.html --r.regex jrccodes.xml
The content of jrccodes.xml is:
<!-- Regular expressions for nanomaterial codes, all reported in the "jrc" field:
     NM codes with an optional hyphen (NM-100, NM100), with or without a trailing K,
     and JRCNM codes with five or six digits, with or without an a/b suffix. -->
<compoundRegex title="jrc">
<regex fields="jrc" weight="2.0">NM[-]?\d\d\dK</regex>
<regex fields="jrc" weight="1.0">NM[-]?\d\d\d</regex>
<regex fields="jrc" weight="2.0">JRCNM\d\d\d\d\d\d?[ab]</regex>
<regex fields="jrc" weight="1.0">JRCNM\d\d\d\d\d\d?</regex>
</compoundRegex>
$ ami2-sequence --project nanotox --filter file\(\*\*/results.xml\) -o sequencesfiles.xml
$ node /usr/lib/node_modules/ctj/lib/ctj.js collect -p nanotox -M -o nanotox -s -g binomial,genus,jrc,frequencies
# DATA ANALYSIS
$ groovy createTSV.groovy
Copy/paste the script's output into the elements array of network.html to visualize it with cytoscape.js in a browser.
import groovy.json.JsonSlurper

/*
 * Reads nanotox/articles.json (a map keyed by article id) and prints, for every
 * AMI match, one line of cytoscape.js element literals: the article node, the
 * matched-term node and the edge connecting them.
 */

// Escape backslashes and single quotes so a value can be embedded in a
// single-quoted JavaScript string literal without terminating it early.
def jsEscape = { value ->
    "${value}".toString().replace('\\', '\\\\').replace("'", "\\'")
}

// Print the article node (red), the term node (in the given colour) and their edge.
def printLink = { pmcid, term, colour ->
    // Widen to long before abs(): Math.abs(Integer.MIN_VALUE) would stay negative.
    def edgeID = "e" + Math.abs((pmcid + term).hashCode() as long)
    def source = jsEscape(pmcid)
    def target = jsEscape(term)
    println "{ data: { id: '${source}', faveColor: 'red' } }, { data: { id: '${target}', faveColor: '${colour}' } }, { data: { id: '${edgeID}', source: '${source}', target: '${target}' } },"
}

def inputFile = new File("nanotox/articles.json")
// Read explicitly as UTF-8 instead of relying on the platform default charset.
def inputJSON = new JsonSlurper().parseText(inputFile.getText("UTF-8"))

inputJSON.each { pmcid, article ->
    def amiResult = article?.amiResults
    // Articles without any jrc match produce no output at all.
    if (!amiResult?.jrc) {
        return // inside each{} this skips to the next article
    }

    // Nanomaterial codes: blue nodes, labelled with the match's value0 field.
    amiResult.jrc.each { nmUse -> printLink(pmcid, nmUse.value0, 'blue') }

    // Species names: green nodes, labelled with the match's exact field.
    // NOTE(review): as in the original script, species are only emitted for
    // articles that also have a jrc match - confirm this nesting is intended.
    if (amiResult.binomial) {
        amiResult.binomial.each { nmUse -> printLink(pmcid, nmUse.exact, 'green') }
    }
}
<!-- Regular expressions for nanomaterial codes, all reported in the "jrc" field:
     NM codes with an optional hyphen (NM-100, NM100), with or without a trailing K,
     and JRCNM codes with five or six digits, with or without an a/b suffix. -->
<compoundRegex title="jrc">
<regex fields="jrc" weight="2.0">NM[-]?\d\d\dK</regex>
<regex fields="jrc" weight="1.0">NM[-]?\d\d\d</regex>
<regex fields="jrc" weight="2.0">JRCNM\d\d\d\d\d\d?[ab]</regex>
<regex fields="jrc" weight="1.0">JRCNM\d\d\d\d\d\d?</regex>
</compoundRegex>
<html>
<head>
  <meta charset="utf-8">
  <title>Nanomaterial literature network</title>
  <!-- cytoscape.js core, then the cola engine and its cytoscape.js layout adapter.
       The cola files are loaded from the jsDelivr GitHub endpoint because the
       RawGit CDN they were originally served from has been shut down. -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/cytoscape/3.1.4/cytoscape.js"></script>
  <script src="https://cdn.jsdelivr.net/gh/cytoscape/cytoscape.js-cola@1.6.0/cola.js"></script>
  <script src="https://cdn.jsdelivr.net/gh/cytoscape/cytoscape.js-cola@1.6.0/cytoscape-cola.js"></script>
  <style>
    body {
      font-family: helvetica;
      font-size: 14px;
    }
    /* The graph container fills the whole viewport, on top of other content. */
    #cy {
      width: 100%;
      height: 100%;
      position: absolute;
      left: 0;
      top: 0;
      z-index: 999;
    }
    h1 {
      opacity: 0.5;
      font-size: 1em;
    }
  </style>
</head>
<body>
<!-- Graph container. HTML has no self-closing div, so it is closed explicitly;
     otherwise the script that follows is parsed as a child of this element. -->
<div id="cy"></div>
<script>
// Build the cytoscape.js graph inside the #cy element.
var cy = cytoscape({
container: document.getElementById('cy'),
// Every node is labelled with its id and filled with the colour stored in its
// faveColor data field.
style: [
{
selector: 'node',
style: {
'label': 'data(id)',
'background-color': 'data(faveColor)',
'color': 'black'
}
}
],
// Each line below holds three elements: an article node (PMC id, red), a
// matched-term node (blue for NM codes, green for the other terms) and the
// edge that joins them.
// NOTE(review): the same node id is declared again on later lines - confirm
// that cytoscape.js tolerates the repeated declarations.
elements:
[
{ data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'Acinetobacter baumannii', faveColor: 'green' } }, { data: { id: 'e981162931', source: 'PMC3841577', target: 'Acinetobacter baumannii' } },
{ data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'E. coli', faveColor: 'green' } }, { data: { id: 'e534625587', source: 'PMC3841577', target: 'E. coli' } },
{ data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'Enterobacter cloacae', faveColor: 'green' } }, { data: { id: 'e2055312211', source: 'PMC3841577', target: 'Enterobacter cloacae' } },
{ data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'Escherichia coli', faveColor: 'green' } }, { data: { id: 'e1575606416', source: 'PMC3841577', target: 'Escherichia coli' } },
{ data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'NM-300', faveColor: 'blue' } }, { data: { id: 'e1081596686', source: 'PMC3841577', target: 'NM-300' } },
{ data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'NM300', faveColor: 'blue' } }, { data: { id: 'e866168537', source: 'PMC3841577', target: 'NM300' } },
{ data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'Pseudomonas stutzeri', faveColor: 'green' } }, { data: { id: 'e733316049', source: 'PMC3841577', target: 'Pseudomonas stutzeri' } },
{ data: { id: 'PMC3841577', faveColor: 'red' } }, { data: { id: 'P. stutzeri', faveColor: 'green' } }, { data: { id: 'e1343958775', source: 'PMC3841577', target: 'P. stutzeri' } },
{ data: { id: 'PMC4105399', faveColor: 'red' } }, { data: { id: 'NM-100', faveColor: 'blue' } }, { data: { id: 'e113805838', source: 'PMC4105399', target: 'NM-100' } },
{ data: { id: 'PMC4105399', faveColor: 'red' } }, { data: { id: 'NM-101', faveColor: 'blue' } }, { data: { id: 'e113805839', source: 'PMC4105399', target: 'NM-101' } },
{ data: { id: 'PMC4105399', faveColor: 'red' } }, { data: { id: 'NM-102', faveColor: 'blue' } }, { data: { id: 'e113805840', source: 'PMC4105399', target: 'NM-102' } },
{ data: { id: 'PMC4105399', faveColor: 'red' } }, { data: { id: 'NM-103', faveColor: 'blue' } }, { data: { id: 'e113805841', source: 'PMC4105399', target: 'NM-103' } },
{ data: { id: 'PMC4105399', faveColor: 'red' } }, { data: { id: 'NM-104', faveColor: 'blue' } }, { data: { id: 'e113805842', source: 'PMC4105399', target: 'NM-104' } },
{ data: { id: 'PMC4105399', faveColor: 'red' } }, { data: { id: 'NM-105', faveColor: 'blue' } }, { data: { id: 'e113805843', source: 'PMC4105399', target: 'NM-105' } },
{ data: { id: 'PMC4219084', faveColor: 'red' } }, { data: { id: 'NM-220', faveColor: 'blue' } }, { data: { id: 'e1196886146', source: 'PMC4219084', target: 'NM-220' } },
{ data: { id: 'PMC4440714', faveColor: 'red' } }, { data: { id: 'NM-103', faveColor: 'blue' } }, { data: { id: 'e1249231778', source: 'PMC4440714', target: 'NM-103' } },
{ data: { id: 'PMC4440714', faveColor: 'red' } }, { data: { id: 'NM-104', faveColor: 'blue' } }, { data: { id: 'e1249231777', source: 'PMC4440714', target: 'NM-104' } },
{ data: { id: 'PMC4440714', faveColor: 'red' } }, { data: { id: 'NM-110', faveColor: 'blue' } }, { data: { id: 'e1249231750', source: 'PMC4440714', target: 'NM-110' } },
{ data: { id: 'PMC4440714', faveColor: 'red' } }, { data: { id: 'NM-111', faveColor: 'blue' } }, { data: { id: 'e1249231749', source: 'PMC4440714', target: 'NM-111' } },
{ data: { id: 'PMC4440714', faveColor: 'red' } }, { data: { id: 'NM-200', faveColor: 'blue' } }, { data: { id: 'e1249230820', source: 'PMC4440714', target: 'NM-200' } },
{ data: { id: 'PMC4440714', faveColor: 'red' } }, { data: { id: 'NM-203', faveColor: 'blue' } }, { data: { id: 'e1249230817', source: 'PMC4440714', target: 'NM-203' } },
{ data: { id: 'PMC4555363', faveColor: 'red' } }, { data: { id: 'NM-105', faveColor: 'blue' } }, { data: { id: 'e1728203983', source: 'PMC4555363', target: 'NM-105' } },
{ data: { id: 'PMC4555363', faveColor: 'red' } }, { data: { id: 'NM-211', faveColor: 'blue' } }, { data: { id: 'e1728202995', source: 'PMC4555363', target: 'NM-211' } },
{ data: { id: 'PMC4555363', faveColor: 'red' } }, { data: { id: 'NM-212', faveColor: 'blue' } }, { data: { id: 'e1728202994', source: 'PMC4555363', target: 'NM-212' } },
{ data: { id: 'PMC4555363', faveColor: 'red' } }, { data: { id: 'Rodent osteopontin', faveColor: 'green' } }, { data: { id: 'e1380551857', source: 'PMC4555363', target: 'Rodent osteopontin' } },
{ data: { id: 'PMC4603643', faveColor: 'red' } }, { data: { id: 'NM-212', faveColor: 'blue' } }, { data: { id: 'e1129320529', source: 'PMC4603643', target: 'NM-212' } },
{ data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'Daphnia magna', faveColor: 'green' } }, { data: { id: 'e1657538436', source: 'PMC4630844', target: 'Daphnia magna' } },
{ data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-103', faveColor: 'blue' } }, { data: { id: 'e68105735', source: 'PMC4630844', target: 'NM-103' } },
{ data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-105', faveColor: 'blue' } }, { data: { id: 'e68105733', source: 'PMC4630844', target: 'NM-105' } },
{ data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-110', faveColor: 'blue' } }, { data: { id: 'e68105707', source: 'PMC4630844', target: 'NM-110' } },
{ data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-200', faveColor: 'blue' } }, { data: { id: 'e68104777', source: 'PMC4630844', target: 'NM-200' } },
{ data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-203', faveColor: 'blue' } }, { data: { id: 'e68104774', source: 'PMC4630844', target: 'NM-203' } },
{ data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-220', faveColor: 'blue' } }, { data: { id: 'e68104715', source: 'PMC4630844', target: 'NM-220' } },
{ data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-300', faveColor: 'blue' } }, { data: { id: 'e68103816', source: 'PMC4630844', target: 'NM-300' } },
{ data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-400', faveColor: 'blue' } }, { data: { id: 'e68102855', source: 'PMC4630844', target: 'NM-400' } },
{ data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-401', faveColor: 'blue' } }, { data: { id: 'e68102854', source: 'PMC4630844', target: 'NM-401' } },
{ data: { id: 'PMC4630844', faveColor: 'red' } }, { data: { id: 'NM-402', faveColor: 'blue' } }, { data: { id: 'e68102853', source: 'PMC4630844', target: 'NM-402' } },
{ data: { id: 'PMC4792104', faveColor: 'red' } }, { data: { id: 'NM-401', faveColor: 'blue' } }, { data: { id: 'e1137848102', source: 'PMC4792104', target: 'NM-401' } },
{ data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-103', faveColor: 'blue' } }, { data: { id: 'e1397588649', source: 'PMC4833924', target: 'NM-103' } },
{ data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-105', faveColor: 'blue' } }, { data: { id: 'e1397588647', source: 'PMC4833924', target: 'NM-105' } },
{ data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-110', faveColor: 'blue' } }, { data: { id: 'e1397588621', source: 'PMC4833924', target: 'NM-110' } },
{ data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-111', faveColor: 'blue' } }, { data: { id: 'e1397588620', source: 'PMC4833924', target: 'NM-111' } },
{ data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-200', faveColor: 'blue' } }, { data: { id: 'e1397587691', source: 'PMC4833924', target: 'NM-200' } },
{ data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-203', faveColor: 'blue' } }, { data: { id: 'e1397587688', source: 'PMC4833924', target: 'NM-203' } },
{ data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-211', faveColor: 'blue' } }, { data: { id: 'e1397587659', source: 'PMC4833924', target: 'NM-211' } },
{ data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-300', faveColor: 'blue' } }, { data: { id: 'e1397586730', source: 'PMC4833924', target: 'NM-300' } },
{ data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-400', faveColor: 'blue' } }, { data: { id: 'e1397585769', source: 'PMC4833924', target: 'NM-400' } },
{ data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-401', faveColor: 'blue' } }, { data: { id: 'e1397585768', source: 'PMC4833924', target: 'NM-401' } },
{ data: { id: 'PMC4833924', faveColor: 'red' } }, { data: { id: 'NM-402', faveColor: 'blue' } }, { data: { id: 'e1397585767', source: 'PMC4833924', target: 'NM-402' } },
{ data: { id: 'PMC5247795', faveColor: 'red' } }, { data: { id: 'NM-105', faveColor: 'blue' } }, { data: { id: 'e1231310283', source: 'PMC5247795', target: 'NM-105' } },
{ data: { id: 'PMC5450058', faveColor: 'red' } }, { data: { id: 'NM-400', faveColor: 'blue' } }, { data: { id: 'e1640982439', source: 'PMC5450058', target: 'NM-400' } },
]
});
// Arrange the graph with the cola layout and start it immediately.
cy.layout(
  {
    name: "cola",
    nodeSpacing: 5,
    // The cola layout option is edgeLength; edgeLengthVal is only a slider
    // parameter in the cytoscape-cola demo page and is ignored by the layout.
    edgeLength: 45,
    animate: true,
    randomize: false,
    maxSimulationTime: 1500
  }
).run();
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment