Skip to content

Instantly share code, notes, and snippets.

@bennokr
bennokr / json2sqlschema.py
Last active October 28, 2020 13:40
JSON to SQL schema
import sys
import json
import pandas as pd
def decompose_json(
tables,
foreignkeys,
obj,
name="",
@bennokr
bennokr / dictionary-code-triples.py
Created October 28, 2019 22:18
Using Sqlite triggers for dictionary encoding triples
import sqlite3
db = sqlite3.connect(':memory:')
db.executescript("""
CREATE TABLE NodeStr(str);
CREATE TABLE NodeTriple(s INTEGER, p INTEGER, o INTEGER);
CREATE VIEW Triple(s,p,o) AS
SELECT sns.str AS s, pns.str AS p, ons.str AS o
FROM NodeTriple AS t
JOIN NodeStr AS sns ON (t.s = sns.rowid)
# Create a fresh SQLite-backed rdflib store via rdflib-sqlalchemy.
# NOTE(review): this gist preview is truncated; `dburi` is presumably passed
# to Graph.open()/store creation further down -- confirm against the full gist.
import sys, os

import rdflib
import rdflib_sqlalchemy

# Register the SQLAlchemy-backed store plugin with rdflib's plugin registry.
rdflib_sqlalchemy.registerplugins()

# Graph identifier; rdflib-sqlalchemy derives its interned table-name prefix
# from this URIRef, as shown by the print below.
ident = rdflib.URIRef("rdflib_test")
print('Table prefix:', rdflib_sqlalchemy.store.generate_interned_id(ident))

# Start from a clean database file on every run.
fname = 'development.sqlite'
if os.path.exists(fname):
    os.remove(fname)

# rdflib-sqlalchemy expects the database URI wrapped in an rdflib Literal.
dburi = rdflib.Literal(f"sqlite:///{os.getcwd()}/development.sqlite")
@bennokr
bennokr / code_nodes.py
Last active October 15, 2019 15:14
Parallel counter for RDF. Outputs URI/literal counts, one per line, space-separated
"""
Parallel counter for RDF. Outputs URI/literal counts, one per line, space-separated
Usage: python code_nodes.py NWORKERS PATH.nt > counts.dsv
"""
import sys,os
try:
_, N_WORKERS, NTPATH = sys.argv
N_WORKERS = int(N_WORKERS)
except Exception as e:
@bennokr
bennokr / terrible-triples.py
Created October 15, 2019 08:45
A terrible idea for a fast sqlite rdf triple store
import sqlite3, string, itertools
FNAME = 'test.nt'
punct_bad = set('"\'')
punct_escaped = set(f'\\{p}' for p in set(string.punctuation))
punct = ''.join( punct_escaped - punct_bad )
db = sqlite3.connect(':memory:')
db.executescript(f"""
@bennokr
bennokr / db_load.py
Last active May 29, 2020 07:07
Very lazy sqlite specification script
"""
Usage: db_load.py DBDIR DATABASE
DBDIR must contain files like <table>.tsv or split files like <table>/*.tsv according to schema
"""
import sys, os, sqlite3, glob, csv, collections
import pandas as pd
DB_INIT = os.environ.get('DB_INIT', None)
if not DB_INIT:
dir_path = os.path.dirname(os.path.realpath(__file__))
import sys, os
try:
redirects_file = sys.argv[1]
label_files = sys.argv[2:]
except:
print('Usage: anchor_counts.py [redirects_file.nt] *[label_files.nt]\nOutput: N(uri & label) N(uri) N(label) uri label')
sys.exit(0)
redirects = {}
for line in open(redirects_file):
from elasticsearch import Elasticsearch
es = Elasticsearch()
es = Elasticsearch(timeout=30)
index = 'test1'
es.indices.delete(index=index, ignore=[400, 404])
es.indices.create(index=index)
es.indices.close(index=index)
es.indices.put_settings(index=index,body={
"index" : {
"analysis" : {
'''
This makes an Elasticsearch index of RDF triples, with different ES datatypes for different RDF datatypes
https://www.w3.org/TR/rdf11-concepts/#xsd-datatypes
TODO https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis.html
'''
import sys, os, json
try:
_, host, index = sys.argv
except Exception as e:
@bennokr
bennokr / README.txt
Last active October 16, 2017 12:07
Brown Clusters
Brown clusters
They are induced as described in the paper:
Joseph Turian, Lev-Arie Ratinov and Yoshua Bengio (2010) "WORD
REPRESENTATIONS: A SIMPLE AND GENERAL METHOD FOR SEMI-SUPERVISED
LEARNING",
on the RCV1 corpus, cleaned as described in the paper (roughly 37M words of News text).
Files matching brown-rcv1.clean.tokenized-CoNLL03.txt-c*-freq1.txt each contain
the Brown clusters induced for a particular number of classes (the "c" value in
the filename).