-
download the extension (crx is a zip)
curl -L -o memento.zip "https://clients2.google.com/service/update2/crx?response=redirect&x=id%3Djgbfpjledahoajcppakbgilmojkaghgm%26uc"
-
unpack the extension
unzip -d memento memento.zip
15:02:16.722 [..r.operations.NERProcess] Extracting named entities in column Autore on row 4 of 4. (3ms) | |
15:02:17.029 [..r.operations.NERProcess] The extractor failed (307ms) | |
java.lang.IllegalArgumentException: dataTXT request failed. | |
at org.freeyourmetadata.ner.services.DataTXT.parseExtractionResponseEntity(DataTXT.java:69) | |
at org.freeyourmetadata.ner.services.NERServiceBase.parseExtractionResponseEntity(NERServiceBase.java:196) | |
at org.freeyourmetadata.ner.services.NERServiceBase.performExtractionRequest(NERServiceBase.java:128) | |
at org.freeyourmetadata.ner.services.NERServiceBase.extractNamedEntities(NERServiceBase.java:100) | |
at org.freeyourmetadata.ner.operations.NERProcess$Extractor.run(NERProcess.java:210) |
~ virtualenv env | |
~ source env/bin/activate | |
~ pip install git+https://github.com/nlevitt/warctools@tweaks | |
~ pip install pyOpenSSL | |
~ git clone https://github.com/nlevitt/warcprox | |
~ cd warcprox | |
~ python warcprox.py --rollover-idle-time=7200 | |
2013-10-20 14:36:07,923 66818 MainThread INFO server_activate(warcprox.py:346) listening on 127.0.0.1:8080 | |
2013-10-20 14:36:07,924 66818 MainThread INFO _read_ca(warcprox.py:75) read CA key+cert from ./warcprox-ca.pem | |
2013-10-20 14:36:07,928 66818 WarcWriterThread INFO run(warcprox.py:510) WarcWriterThread starting, directory=/private/tmp/warcprox/warcs gzip=False rollover_size=1000000000 rollover_idle_time=7200 prefix=WARCPROX port=8080 |
[ | |
{ | |
"op": "core/column-addition", | |
"description": "Create column url at index 2 based on column Column 2 using expression jython:import httplib\nconn = httplib.HTTPConnection(\"dx.doi.org\")\ndoi = \"/\"+value\nconn.request(\"HEAD\", doi)\nres = conn.getresponse()\nreturn res.getheader('location')", | |
"engineConfig": { | |
"facets": [], | |
"mode": "row-based" | |
}, | |
"newColumnName": "url", | |
"columnInsertIndex": 2, |
# Gemfile: dependencies resolved by Bundler.

# Fetch gems over HTTPS so that gem metadata and packages cannot be
# tampered with in transit (plain http:// sources are unauthenticated).
source "https://rubygems.org"

# OAI client library, taken from the "seamless-resumption" branch of the
# tjdett fork on GitHub rather than from the released gem.
gem "oai", :git => "https://github.com/tjdett/ruby-oai.git", :branch => "seamless-resumption"
gem "redis"
gem "libxml-ruby"
#!/bin/bash | |
. heritrix.conf | |
if [ -z "$1" ] || [ -z "$2" ]; then | |
echo usage: $0 jobname seedsfile | |
exit | |
fi | |
JOB=$1 |