Skip to content

Instantly share code, notes, and snippets.

@macleginn
Created February 3, 2019 12:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save macleginn/9fda483346c159836184a69427105b0b to your computer and use it in GitHub Desktop.
Save macleginn/9fda483346c159836184a69427105b0b to your computer and use it in GitHub Desktop.
import sqlite3
import pandas as pd
import html
import re
# Remove <br>, <br/>, </br> tags and collapse line breaks and runs of
# spaces into a single space.
def normalise_ws(s):
    """Replace ``<br>``-style tags and line breaks with single spaces.

    The steps are applied in this order:

    1. every ``<br>``, ``<br/>``, ``</br>`` (or ``</br/>``) tag becomes a space;
    2. every run of newline / carriage-return characters, together with the
       spaces that directly follow it, becomes a space;
    3. every run of two or more spaces collapses to one space;
    4. leading and trailing whitespace is stripped.

    Note: tabs and other whitespace characters inside the string are left
    untouched, and tags containing a space such as ``<br />`` are not matched.

    :param s: text to normalise.
    :return: the normalised text.
    """
    s = re.sub(r'</?br/?>', ' ', s)
    s = re.sub(r'[\n\r]+ *', ' ', s)
    s = re.sub(r' {2,}', ' ', s)
    return s.strip()
# Load the Perl-side dump (tab-separated, no header row).
# dtype=str together with keep_default_na=False keeps every cell a plain
# string: without them pandas turns empty cells and tokens such as "NA" or
# "null" into float NaN, and the .strip() call below would raise
# AttributeError on those values.
dump_perl = pd.read_csv(
    'dump_area_perl.tsv',
    delimiter='\t',
    header=None,
    dtype=str,
    keep_default_na=False,
)
# The compared text is taken from the fourth column (index 3); HTML entities
# are decoded before whitespace normalisation.
perl_dump_set = {
    normalise_ws(html.unescape(el.strip())) for el in dump_perl.iloc[:, 3]
}
conn = sqlite3.connect('myths_logged.sqlite')
try:
    cursor = conn.cursor()
    # The same normalise_ws function was applied when the data was loaded
    # into the table, so it does not need to be applied again here.
    python_dump_set = {
        html.unescape(row[0].strip())
        for row in cursor.execute(
            'select `regional_dump` from `dn_dump_area_from_html`')
    }
finally:
    # All rows are already in memory; release the database handle even if
    # the query or the post-processing failed.
    conn.close()
# Blocks from the Perl dump that have no exact match in the Python dump.
perl_only = perl_dump_set - python_dump_set
to_remove = set()
# Sets iterate in arbitrary order; sorting once makes the report (and the
# superstring chosen for each block) reproducible between runs.
python_blocks = sorted(python_dump_set)
# Pull out the blocks that occur as substrings of some block in the Python
# dump: each match is logged as "substring / containing block / blank line".
with open('substrings.txt', 'w', encoding='utf-8') as out:
    for el1 in sorted(perl_only):
        for el2 in python_blocks:
            if el1 in el2:
                print(el1, file=out)
                print(el2, file=out)
                print('', file=out)
                to_remove.add(el1)
                # One containing block is enough; move on to the next el1.
                break
perl_only = perl_only - to_remove
# Write out whatever is left so it can be checked by hand, one block per
# line in sorted order. The encoding is explicit so the output does not
# depend on the platform's default locale.
with open('perl_only.txt', 'w', encoding='utf-8') as out:
    for el in sorted(perl_only):
        print(el, file=out)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment