Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save rominf/fd6545c659689d0b899b99af14636cbe to your computer and use it in GitHub Desktop.
Save rominf/fd6545c659689d0b899b99af14636cbe to your computer and use it in GitHub Desktop.
from pony import orm
import lmdb
# LMDB environment used as the on-disk scratch store: the memory map may grow
# up to 16 GiB and the environment may hold at most two named sub-databases.
env = lmdb.open(
    '../conceptnet-lite-data/conceptnet-lmdb.db',
    max_dbs=2,
    map_size=16 * 1024 ** 3,
)
# Handle of the named sub-database 'start' inside that environment.
start_db = env.open_db(b'start')

# Pony ORM database object; entity classes are declared against it before it
# is bound to a concrete provider.
db = orm.Database()
class Label(db.Entity):
    """Pony ORM entity made of two required string attributes."""

    # Text of the label.
    text = orm.Required(str)
    # Language the label belongs to.
    language = orm.Required(str)
from pathlib import Path
# Bind the Pony database to an SQLite file (created when missing); the file
# name is handed over as an absolute path.
db.bind(
    provider='sqlite',
    filename=str(Path('../conceptnet-lite-data/normalization-test.db').resolve()),
    create_db=True,
)
# Map the declared entities onto tables, creating the tables when absent.
db.generate_mapping(create_tables=True)
import csv
from pathlib import Path
from typing import Generator, Optional, Tuple, Union
# Anything accepted as a filesystem location by the helpers below.
PathOrStr = Union[Path, str]


def edges_from_dump_by_parts_generator(
        path: PathOrStr,
        count: Optional[int] = None,
) -> Generator[Tuple[str, str, str, str], None, None]:
    """Lazily yield columns 1-4 of every row of a tab-separated dump.

    The caller in this notebook unpacks each yielded item as
    ``(relation_name, start_uri, end_uri, edge_etc_json)``.

    Args:
        path: Location of the tab-separated dump file.
        count: Maximum number of rows to yield. ``None`` (the default) reads
            the whole file; ``0`` yields nothing.

    Yields:
        A tuple holding the fields at indexes 1 to 4 of the current row (fewer
        items if the row is shorter).
    """
    # Explicit UTF-8 so that decoding does not depend on the platform's
    # default locale encoding; newline='' is what the csv module expects.
    with open(str(path), newline='', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        for i, row in enumerate(reader):
            # Check the limit *before* yielding: the previous post-yield
            # `i == count` test let through one row too many.
            if count is not None and i >= count:
                break
            # Convert the list slice to a tuple to match the declared type.
            yield tuple(row[1:5])
%%time
i = 0
with env.begin(start_db, write=True) as txn:
for relation_name, start_uri, end_uri, edge_etc_json in edges_from_dump_by_parts_generator('../conceptnet-lite-data/conceptnet-assertions-5.7.0.csv'):
i += 1
language_b, start_b = [x.encode('utf8') for x in start_uri.split('/', maxsplit=4)[2:4]]
exising_start_language_b = txn.get(start_b)
if exising_start_language_b != language_b:
txn.put(start_b, language_b)
if i % 1000000 == 0:
print(i)
%%time
with env.begin(start_db) as txn:
cursor = txn.cursor()
total_count = txn.stat(start_db)['entries']
i = 0
while i < total_count:
with orm.db_session:
for key, value in cursor:
i += 1
text = key.decode('utf8')
language = value.decode('utf8')
Label(text=text, language=language)
if i % 1000000 == 0:
print(i)
cursor.next()
break
# Sanity check: print the number of entries in the LMDB sub-database, then
# evaluate the number of Label rows so the two can be compared.
with env.begin(start_db) as txn:
    cursor = txn.cursor()
    lmdb_entries = txn.stat(start_db)['entries']
    print(lmdb_entries)
Label.select().count()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment