This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import json | |
import pandas as pd | |
def decompose_json( | |
tables, | |
foreignkeys, | |
obj, | |
name="", |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlite3 | |
db = sqlite3.connect(':memory:') | |
db.executescript(""" | |
CREATE TABLE NodeStr(str); | |
CREATE TABLE NodeTriple(s INTEGER, p INTEGER, o INTEGER); | |
CREATE VIEW Triple(s,p,o) AS | |
SELECT sns.str AS s, pns.str AS p, ons.str AS o | |
FROM NodeTriple AS t | |
JOIN NodeStr AS sns ON (t.s = sns.rowid) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys,os, rdflib, rdflib_sqlalchemy | |
rdflib_sqlalchemy.registerplugins() | |
ident = rdflib.URIRef("rdflib_test") | |
print('Table prefix:', rdflib_sqlalchemy.store.generate_interned_id(ident)) | |
fname = 'development.sqlite' | |
if os.path.exists(fname): | |
os.remove(fname) | |
dburi = rdflib.Literal(f"sqlite:///{os.getcwd()}/development.sqlite") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Parallel counter for RDF. Outputs URI/literal counts, one per line, space-separated | |
Usage: python code_nodes.py NWORKERS PATH.nt > counts.dsv | |
""" | |
import sys,os | |
try: | |
_, N_WORKERS, NTPATH = sys.argv | |
N_WORKERS = int(N_WORKERS) | |
except Exception as e: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlite3, string, itertools | |
FNAME = 'test.nt' | |
punct_bad = set('"\'') | |
punct_escaped = set(f'\\{p}' for p in set(string.punctuation)) | |
punct = ''.join( punct_escaped - punct_bad ) | |
db = sqlite3.connect(':memory:') | |
db.executescript(f""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Usage: db_load.py DBDIR DATABASE | |
DBDIR must contain files like <table>.tsv or split files like <table>/*.tsv according to schema | |
""" | |
import sys, os, sqlite3, glob, csv, collections | |
import pandas as pd | |
DB_INIT = os.environ.get('DB_INIT', None) | |
if not DB_INIT: | |
dir_path = os.path.dirname(os.path.realpath(__file__)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys, os | |
try: | |
redirects_file = sys.argv[1] | |
label_files = sys.argv[2:] | |
except: | |
print('Usage: anchor_counts.py [redirects_file.nt] *[label_files.nt]\nOutput: N(uri & label) N(uri) N(label) uri label') | |
sys.exit(0) | |
redirects = {} | |
for line in open(redirects_file): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from elasticsearch import Elasticsearch | |
es = Elasticsearch() | |
es = Elasticsearch(timeout=30) | |
index = 'test1' | |
es.indices.delete(index=index, ignore=[400, 404]) | |
es.indices.create(index=index) | |
es.indices.close(index=index) | |
es.indices.put_settings(index=index,body={ | |
"index" : { | |
"analysis" : { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
This makes an Elasticsearch index of RDF triples, with different ES datatypes for different RDF datatypes | |
https://www.w3.org/TR/rdf11-concepts/#xsd-datatypes | |
TODO https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis.html | |
''' | |
import sys, os, json | |
try: | |
_, host, index = sys.argv | |
except Exception as e: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Brown clusters | |
They are induced as described in the paper: | |
Joseph Turian, Lev-Arie Ratinov and Yoshua Bengio (2010) "WORD | |
REPRESENTATIONS: A SIMPLE AND GENERAL METHOD FOR SEMI-SUPERVISED | |
LEARNING", | |
on the RCV1 corpus, cleaned as described in the paper (roughly 37M words of News text). | |
brown-rcv1.clean.tokenized-CoNLL03.txt-c*-freq1.txt | |
Brown clusters for a particular number of induced classes. |