Skip to content

Instantly share code, notes, and snippets.

@bennokr
bennokr / json2sqlschema.py
Last active October 28, 2020 13:40
JSON to SQL schema
import sys
import json
import pandas as pd
def decompose_json(
tables,
foreignkeys,
obj,
name="",
@bennokr
bennokr / dictionary-code-triples.py
Created October 28, 2019 22:18
Using Sqlite triggers for dictionary encoding triples
import sqlite3
db = sqlite3.connect(':memory:')
db.executescript("""
CREATE TABLE NodeStr(str);
CREATE TABLE NodeTriple(s INTEGER, p INTEGER, o INTEGER);
CREATE VIEW Triple(s,p,o) AS
SELECT sns.str AS s, pns.str AS p, ons.str AS o
FROM NodeTriple AS t
JOIN NodeStr AS sns ON (t.s = sns.rowid)
# Create a fresh SQLite-backed rdflib store via rdflib-sqlalchemy.
# NOTE(review): this gist preview is truncated; `dburi` is presumably passed
# to Graph.open()/store creation further down -- confirm against the full gist.
import sys, os

import rdflib
import rdflib_sqlalchemy

# Register the SQLAlchemy-backed store plugin with rdflib's plugin registry.
rdflib_sqlalchemy.registerplugins()

# Graph identifier; rdflib-sqlalchemy derives its interned table-name prefix
# from this URIRef, as shown by the print below.
ident = rdflib.URIRef("rdflib_test")
print('Table prefix:', rdflib_sqlalchemy.store.generate_interned_id(ident))

# Start from a clean database file on every run.
fname = 'development.sqlite'
if os.path.exists(fname):
    os.remove(fname)

# rdflib-sqlalchemy expects the database URI wrapped in an rdflib Literal.
dburi = rdflib.Literal(f"sqlite:///{os.getcwd()}/development.sqlite")
@bennokr
bennokr / code_nodes.py
Last active October 15, 2019 15:14
Parallel counter for RDF. Outputs URI/literal counts, one per line, space-separated
"""
Parallel counter for RDF. Outputs URI/literal counts, one per line, space-separated
Usage: python code_nodes.py NWORKERS PATH.nt > counts.dsv
"""
import sys,os
try:
_, N_WORKERS, NTPATH = sys.argv
N_WORKERS = int(N_WORKERS)
except Exception as e:
@bennokr
bennokr / terrible-triples.py
Created October 15, 2019 08:45
A terrible idea for a fast sqlite rdf triple store
import sqlite3, string, itertools
FNAME = 'test.nt'
punct_bad = set('"\'')
punct_escaped = set(f'\\{p}' for p in set(string.punctuation))
punct = ''.join( punct_escaped - punct_bad )
db = sqlite3.connect(':memory:')
db.executescript(f"""
@bennokr
bennokr / db_load.py
Last active May 29, 2020 07:07
Very lazy sqlite specification script
"""
Usage: db_load.py DBDIR DATABASE
DBDIR must contain files like <table>.tsv or split files like <table>/*.tsv according to schema
"""
import sys, os, sqlite3, glob, csv, collections
import pandas as pd
DB_INIT = os.environ.get('DB_INIT', None)
if not DB_INIT:
dir_path = os.path.dirname(os.path.realpath(__file__))
import sys, os
try:
redirects_file = sys.argv[1]
label_files = sys.argv[2:]
except:
print('Usage: anchor_counts.py [redirects_file.nt] *[label_files.nt]\nOutput: N(uri & label) N(uri) N(label) uri label')
sys.exit(0)
redirects = {}
for line in open(redirects_file):
from elasticsearch import Elasticsearch
es = Elasticsearch()
es = Elasticsearch(timeout=30)
index = 'test1'
es.indices.delete(index=index, ignore=[400, 404])
es.indices.create(index=index)
es.indices.close(index=index)
es.indices.put_settings(index=index,body={
"index" : {
"analysis" : {
'''
This makes an Elasticsearch index of RDF triples, with different ES datatypes for different RDF datatypes
https://www.w3.org/TR/rdf11-concepts/#xsd-datatypes
TODO https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis.html
'''
import sys, os, json
try:
_, host, index = sys.argv
except Exception as e:
@bennokr
bennokr / README.txt
Last active October 16, 2017 12:07
Brown Clusters
Brown clusters
They are induced as described in the paper:
Joseph Turian, Lev-Arie Ratinov and Yoshua Bengio (2010) "WORD
REPRESENTATIONS: A SIMPLE AND GENERAL METHOD FOR SEMI-SUPERVISED
LEARNING",
on the RCV1 corpus, cleaned as described in the paper (roughly 37M words of News text).
Files matching brown-rcv1.clean.tokenized-CoNLL03.txt-c*-freq1.txt each contain
the Brown clusters induced for a particular number of classes (the "c" value in
the filename).