Skip to content

Instantly share code, notes, and snippets.

@kuk
Last active August 10, 2018 08:06
Show Gist options
  • Save kuk/40344ddc6ef9a6807c349610c4a1e4ca to your computer and use it in GitHub Desktop.
Save kuk/40344ddc6ef9a6807c349610c4a1e4ca to your computer and use it in GitHub Desktop.
.ipynb_checkpoints/
PullentiPython/
news.txt
import sys
sys.path.append('PullentiPython')
import json
from collections import namedtuple, Counter, defaultdict
from random import seed, sample
from ipymarkup import (
Span as MarkupSpan,
AsciiMarkup,
LineMarkup,
LineLabelMarkup
)
from tqdm import tqdm as log_progress
def load_lines(path):
    """Lazily yield the lines of the text file at ``path``.

    The file is opened with the platform default encoding and each line is
    yielded with its trailing newline characters removed; other trailing
    whitespace is kept.
    """
    with open(path) as source:
        for raw_line in source:
            yield raw_line.rstrip('\n')
def format_json(data):
    """Return ``data`` serialized as JSON text.

    Output is indented by two spaces and non-ASCII characters are written
    as-is instead of being escaped.
    """
    options = dict(indent=2, ensure_ascii=False)
    return json.dumps(data, **options)
def show_json(data):
    """Print ``data`` to stdout in the layout produced by ``format_json``."""
    text = format_json(data)
    print(text)
##########
#
# UTILS
#
#########
from collections import OrderedDict
def assert_type(item, types):
    """Raise ``TypeError`` unless ``item`` is an instance of ``types``.

    ``types`` is either a single class or a tuple of classes, exactly as
    accepted by ``isinstance``. The error message lists the expected class
    names joined by ``' or '`` followed by the actual class name.
    """
    if isinstance(item, types):
        return
    # Normalize a lone class to a sequence so the message code is uniform.
    expected = types if isinstance(types, tuple) else [types]
    names = [cls.__name__ for cls in expected]
    raise TypeError('expected {types}, got {type}'.format(
        types=' or '.join(names),
        type=type(item).__name__
    ))
def assert_not_empty(item):
    """Raise ``ValueError`` when ``len(item)`` is zero."""
    if not len(item):
        raise ValueError('expected not empty')
def assert_one_of(item, items):
    """Raise ``ValueError`` unless ``item`` is contained in ``items``.

    The message shows the ``repr`` of both the offending item and the
    allowed collection.
    """
    if item in items:
        return
    message = '{item!r} not in {items!r}'.format(item=item, items=items)
    raise ValueError(message)
def _jsonify_value(value):
    """Convert one attribute value into its JSON-friendly form.

    Records are replaced by their ``as_json`` mapping, lists and tuples are
    converted element by element (recursively) into lists, and every other
    value is returned unchanged.
    """
    if isinstance(value, Record):
        return value.as_json
    if isinstance(value, (list, tuple)):
        return [_jsonify_value(_) for _ in value]
    return value


def jsonify(record):
    """Return an ``OrderedDict`` of the attributes listed by ``record``.

    Keys follow the order of ``record.__attributes__``. Nested records and
    sequences of records are converted recursively; sequences holding plain
    values (for example a list of strings) are kept as lists of those values
    instead of being treated as records.
    """
    data = OrderedDict()
    for key in record.__attributes__:
        data[key] = _jsonify_value(getattr(record, key))
    return data


class Record(object):
    """Base class for simple value objects.

    Subclasses list their field names in ``__attributes__``; equality,
    iteration, hashing, ``repr`` and JSON conversion are all derived from
    that list.
    """

    __attributes__ = []

    def __eq__(self, other):
        # Exact type match is required: a subclass instance never equals an
        # instance of its parent even when all fields agree.
        return (
            type(self) == type(other)
            and all(
                (getattr(self, _) == getattr(other, _))
                for _ in self.__attributes__
            )
        )

    def __ne__(self, other):
        return not self == other

    def __iter__(self):
        """Iterate over the field values in ``__attributes__`` order."""
        return (getattr(self, _) for _ in self.__attributes__)

    def __hash__(self):
        # Hash of the field tuple; raises TypeError if any field value is
        # itself unhashable (e.g. a list).
        return hash(tuple(self))

    @property
    def as_json(self):
        """Ordered mapping of field names to JSON-friendly values."""
        return jsonify(self)

    def __repr__(self):
        name = self.__class__.__name__
        args = ', '.join(
            '{key}={value!r}'.format(
                key=_,
                value=getattr(self, _)
            )
            for _ in self.__attributes__
        )
        return '{name}({args})'.format(
            name=name,
            args=args
        )

    def _repr_pretty_(self, printer, cycle):
        """Pretty-printer hook: one ``key=value`` per indented line.

        ``printer`` must provide ``text``, ``pretty``, ``break_`` and an
        ``indent`` context manager; ``cycle`` is true when the record is
        being printed recursively inside itself.
        """
        name = self.__class__.__name__
        if cycle:
            printer.text('{name}(...)'.format(name=name))
        else:
            printer.text('{name}('.format(name=name))
            keys = self.__attributes__
            size = len(keys)
            if size:
                with printer.indent(4):
                    printer.break_()
                    for index, key in enumerate(keys):
                        printer.text(key + '=')
                        value = getattr(self, key)
                        printer.pretty(value)
                        # No trailing comma or break after the last field.
                        if index < size - 1:
                            printer.text(',')
                            printer.break_()
                printer.break_()
            printer.text(')')
##############
#
# LANG
#
##########
from pullenti.morph.MorphLang import MorphLang
from pullenti.morph.Morphology import Morphology
from pullenti.morph.Explanatory import Explanatory
# Language codes. langs_to_raw() looks each code up as an attribute of
# MorphLang via getattr, so the spelling has to match that class.
RU = 'RU'
UA = 'UA'
BY = 'BY'
EN = 'EN'
IT = 'IT'
KZ = 'KZ'
# Every code accepted by set_langs() and raw_to_langs().
LANGS = {RU, UA, BY, EN, IT, KZ}
# Used by Processor when loaded_langs() reports nothing loaded.
DEFAULT_LANGS = {RU, EN}
def langs_to_raw(langs):
    """Fold language codes into a single ``MorphLang`` value.

    Starts from an empty ``MorphLang()`` and ORs in the ``MorphLang``
    attribute named by each code in ``langs``.
    """
    combined = MorphLang()
    for code in langs:
        combined |= getattr(MorphLang, code)
    return combined
def raw_to_langs(raw):
    """Turn a raw language value into a set of language codes.

    Relies on ``str(raw)`` being a ``;``-separated list of codes; an empty
    string yields an empty set. Raises ``ValueError`` for any code that is
    not in ``LANGS``.
    """
    text = str(raw)  # RU;EN
    codes = text.split(';') if text else []
    for code in codes:
        assert_one_of(code, LANGS)
    return set(codes)
def loaded_langs():
    """Return the set of language codes reported as loaded by ``Morphology``."""
    return raw_to_langs(Morphology._get_loaded_languages())
def unload_langs(langs):
    """Unload ``langs`` from ``Morphology`` first and ``Explanatory`` second."""
    mask = langs_to_raw(langs)
    Morphology.unload_languages(mask)
    Explanatory.unload_languages(mask)
def load_langs(langs):
    """Load ``langs`` into ``Morphology`` first and ``Explanatory`` second."""
    mask = langs_to_raw(langs)
    Morphology.load_languages(mask)
    Explanatory.load_languages(mask)
def set_langs(langs):
    """Make the loaded languages match ``langs``.

    ``langs`` must be a non-empty iterable of codes from ``LANGS``;
    otherwise ``ValueError`` is raised before anything is (un)loaded.
    Languages that are currently loaded but not requested are unloaded,
    then the requested ones are loaded.
    """
    requested = set(langs)
    assert_not_empty(requested)
    for code in requested:
        assert_one_of(code, LANGS)

    # Currently loaded languages that were not asked for.
    stale = loaded_langs() - requested
    unload_langs(stale)
    load_langs(requested)
##########
#
# PREPROCESS
#
#############
from pullenti.morph.internal.UnicodeInfo import UnicodeInfo
# The ``uni_char`` of every entry in UnicodeInfo.ALL_CHARS.
VALID = set(info.uni_char for info in UnicodeInfo.ALL_CHARS)


def preprocess(text):
    """Return ``text`` with every character not present in ``VALID`` dropped."""
    kept = [char for char in text if char in VALID]
    return ''.join(kept)
###########
#
# PROCESSOR
#
###########
from pullenti.ner.Sdk import Sdk
from pullenti.ner.Processor import Processor as RawProcessor
from pullenti.ner.SourceOfAnalysis import SourceOfAnalysis
from pullenti.ner.ProcessorService import ProcessorService
from pullenti.ner.money.MoneyAnalyzer import MoneyAnalyzer
from pullenti.ner.date.DateAnalyzer import DateAnalyzer
from pullenti.ner.geo.GeoAnalyzer import GeoAnalyzer
from pullenti.ner._org.OrganizationAnalyzer import OrganizationAnalyzer
from pullenti.ner.person.PersonAnalyzer import PersonAnalyzer
# Analyzer names, taken from each analyzer class's ANALYZER_NAME constant.
PERSON = PersonAnalyzer.ANALYZER_NAME
ORGANIZATION = OrganizationAnalyzer.ANALYZER_NAME
GEO = GeoAnalyzer.ANALYZER_NAME
DATE = DateAnalyzer.ANALYZER_NAME
MONEY = MoneyAnalyzer.ANALYZER_NAME
# Names that Processor accepts; anything else is rejected with ValueError.
ANALYZERS = {
    PERSON,
    ORGANIZATION,
    GEO,
    DATE,
    MONEY
}
def select_analyzers(selected):
    """Yield a fresh clone of every registered analyzer named in ``selected``.

    Analyzers come from ``ProcessorService._get_analyzers()``; clones that
    come back as ``None`` are skipped.
    """
    for prototype in ProcessorService._get_analyzers():
        if prototype.name not in selected:
            continue
        clone = prototype.clone()
        if clone is not None:  # TODO why would it happen?
            yield clone
class Processor(Record):
    """Callable wrapper around a raw Pullenti processor.

    ``analyzers`` is a collection of names from ``ANALYZERS``. Calling the
    instance on a text runs the raw processor and returns a ``Result``.
    """

    __attributes__ = ['analyzers']

    def __init__(self, analyzers):
        for name in analyzers:
            assert_one_of(name, ANALYZERS)
        self.analyzers = analyzers

        # The SDK is initialized with the currently loaded languages, or
        # with the defaults when none are loaded.
        # TODO maybe cache inits
        Sdk.initialize(langs_to_raw(loaded_langs() or DEFAULT_LANGS))

        self.raw = processor = RawProcessor()
        for analyzer in select_analyzers(self.analyzers):
            processor.add_analyzer(analyzer)

    def __call__(self, text):
        """Process ``text`` and return the converted result."""
        source = SourceOfAnalysis(text)
        return convert_result(self.raw.process(source))
###########
#
# REFERENT
#
#############
from pullenti.ner.Referent import Referent as RawReferent
from pullenti.ner.person.PersonReferent import PersonReferent as RawPersonReferent
from pullenti.ner.person.PersonPropertyReferent import PersonPropertyReferent as RawPersonPropertyReferent
from pullenti.ner.person.PersonIdentityReferent import PersonIdentityReferent as RawPersonIdentityReferent
from pullenti.ner._org.OrganizationReferent import OrganizationReferent as RawOrganizationReferent
from pullenti.ner.geo.GeoReferent import GeoReferent as RawGeoReferent
from pullenti.ner.date.DateReferent import DateReferent as RawDateReferent
from pullenti.ner.date.DateRangeReferent import DateRangeReferent as RawDateRangeReferent
from pullenti.ner.money.MoneyReferent import MoneyReferent as RawMoneyReferent
from pullenti.ner.phone.PhoneReferent import PhoneReferent as RawPhoneReferent
class Slot(Record):
    """A ``key``/``value`` pair belonging to a referent."""

    __attributes__ = ['key', 'value']

    def __init__(self, key, value):
        self.key, self.value = key, value
class Referent(Record):
    """Base wrapper for an extracted entity: a ``label`` plus its ``slots``.

    ``raw`` is a plain class-level default of ``None``; ``convert_referent``
    sets it on each instance to the underlying raw object.
    """

    __attributes__ = ['label', 'slots']

    raw = None

    def __init__(self, label, slots=()):
        self.label, self.slots = label, slots
def slot_property(key):
    """Build a read-only property returning the first slot value for ``key``.

    The property scans ``referent.slots`` in order and returns the ``value``
    of the first slot whose ``key`` matches, or ``None`` when there is none.
    """
    def first_value(referent):
        matching = (slot.value for slot in referent.slots if slot.key == key)
        return next(matching, None)

    return property(first_value)
def raw_property(method):
    """Build a read-only property that delegates to ``referent.raw``.

    ``method`` is a property object taken from the raw class; its getter is
    looked up and applied to ``referent.raw`` each time the attribute is
    read, not when the property is created.
    """
    def delegate(referent):
        target = referent.raw
        return method.fget(target)

    return property(delegate)
class PersonReferent(Referent):
    """Wrapper exposing the fields of a ``RawPersonReferent``.

    All attributes except ``age`` return the first converted slot with the
    corresponding ``ATTR_*`` key (``None`` when absent); ``age`` delegates
    to the ``age`` property of ``self.raw``.
    """

    sex = slot_property(RawPersonReferent.ATTR_SEX)
    identity = slot_property(RawPersonReferent.ATTR_IDENTITY)
    # Misspelled historical name, kept so existing callers keep working.
    indentity = slot_property(RawPersonReferent.ATTR_IDENTITY)
    firstname = slot_property(RawPersonReferent.ATTR_FIRSTNAME)
    middlename = slot_property(RawPersonReferent.ATTR_MIDDLENAME)
    lastname = slot_property(RawPersonReferent.ATTR_LASTNAME)
    nickname = slot_property(RawPersonReferent.ATTR_NICKNAME)
    attribute = slot_property(RawPersonReferent.ATTR_ATTR)
    age = raw_property(RawPersonReferent.age)
    born = slot_property(RawPersonReferent.ATTR_BORN)
    die = slot_property(RawPersonReferent.ATTR_DIE)
    contact = slot_property(RawPersonReferent.ATTR_CONTACT)
    iddoc = slot_property(RawPersonReferent.ATTR_IDDOC)
class PersonPropertyReferent(Referent):
    """Wrapper exposing the fields of a ``RawPersonPropertyReferent``.

    ``name`` delegates to the ``name`` property of ``self.raw``; the other
    attributes return the first converted slot with the matching key.
    """

    name = raw_property(RawPersonPropertyReferent.name)
    attribute = slot_property(RawPersonPropertyReferent.ATTR_ATTR)
    ref = slot_property(RawPersonPropertyReferent.ATTR_REF)
    # ATTR_HIGHER is a slot key, not a property object, so it must go
    # through slot_property; raw_property would fail on ``.fget`` at access.
    higher = slot_property(RawPersonPropertyReferent.ATTR_HIGHER)
class PersonIdentityReferent(Referent):
    """Wrapper exposing the fields of a ``RawPersonIdentityReferent``.

    ``date`` and ``org`` are read from the converted slots; the remaining
    attributes delegate to properties of ``self.raw`` (``type`` reads the
    raw ``typ`` property).
    """

    type = raw_property(RawPersonIdentityReferent.typ)
    number = raw_property(RawPersonIdentityReferent.number)
    date = slot_property(RawPersonIdentityReferent.ATTR_DATE)
    org = slot_property(RawPersonIdentityReferent.ATTR_ORG)
    state = raw_property(RawPersonIdentityReferent.state)
    address = raw_property(RawPersonIdentityReferent.address)
class OrganizationReferent(Referent):
    """Wrapper exposing the fields of a ``RawOrganizationReferent``.

    Attributes built with ``slot_property`` return the first converted slot
    for the given ``ATTR_*`` key; those built with ``raw_property`` return
    whatever the same-named property of ``self.raw`` returns, without
    conversion.
    """

    type = slot_property(RawOrganizationReferent.ATTR_TYPE)
    number = raw_property(RawOrganizationReferent.number)
    eponym = slot_property(RawOrganizationReferent.ATTR_EPONYM)
    higher = raw_property(RawOrganizationReferent.higher)
    owner = raw_property(RawOrganizationReferent.owner)
    geo = slot_property(RawOrganizationReferent.ATTR_GEO)
    kladr = slot_property(RawOrganizationReferent.ATTR_KLADR)
    misc = slot_property(RawOrganizationReferent.ATTR_MISC)
    profile = slot_property(RawOrganizationReferent.ATTR_PROFILE)
    inn = raw_property(RawOrganizationReferent.inn)
    ogrn = raw_property(RawOrganizationReferent.ogrn)
    names = raw_property(RawOrganizationReferent.names)
    profiles = raw_property(RawOrganizationReferent.profiles)
    types = raw_property(RawOrganizationReferent.types)
    kind = raw_property(RawOrganizationReferent.kind)
class GeoReferent(Referent):
    """Wrapper exposing the fields of a ``RawGeoReferent``.

    Every attribute except ``types`` returns the first converted slot for
    the given ``ATTR_*`` key; ``types`` delegates to the raw ``typs``
    property.
    """

    name = slot_property(RawGeoReferent.ATTR_NAME)
    type = slot_property(RawGeoReferent.ATTR_TYPE)
    alpha2 = slot_property(RawGeoReferent.ATTR_ALPHA2)
    higher = slot_property(RawGeoReferent.ATTR_HIGHER)
    ref = slot_property(RawGeoReferent.ATTR_REF)
    fias = slot_property(RawGeoReferent.ATTR_FIAS)
    bti = slot_property(RawGeoReferent.ATTR_BTI)
    types = raw_property(RawGeoReferent.typs)
class DateReferent(Referent):
    """Wrapper exposing the fields of a ``RawDateReferent``.

    Every attribute delegates to a property of ``self.raw``;
    ``as_datetime`` reads the raw ``dt`` property, the rest keep the raw
    property's name.
    """

    as_datetime = raw_property(RawDateReferent.dt)
    century = raw_property(RawDateReferent.century)
    year = raw_property(RawDateReferent.year)
    month = raw_property(RawDateReferent.month)
    day = raw_property(RawDateReferent.day)
    day_of_week = raw_property(RawDateReferent.day_of_week)
    hour = raw_property(RawDateReferent.hour)
    minute = raw_property(RawDateReferent.minute)
    second = raw_property(RawDateReferent.second)
    higher = raw_property(RawDateReferent.higher)
    pointer = raw_property(RawDateReferent.pointer)
class DateRangeReferent(Referent):
    """Wrapper exposing the fields of a ``RawDateRangeReferent``.

    ``from_`` reads the raw ``date_from`` property and ``to`` reads the raw
    ``date_to`` property of ``self.raw``.
    """

    # Trailing underscore because ``from`` is a Python keyword.
    from_ = raw_property(RawDateRangeReferent.date_from)
    to = raw_property(RawDateRangeReferent.date_to)
class MoneyReferent(Referent):
    """Wrapper exposing the fields of a ``RawMoneyReferent``.

    Every attribute delegates to the same-named property of ``self.raw``.
    """

    currency = raw_property(RawMoneyReferent.currency)
    value = raw_property(RawMoneyReferent.value)
    alt_value = raw_property(RawMoneyReferent.alt_value)
    rest = raw_property(RawMoneyReferent.rest)
    alt_rest = raw_property(RawMoneyReferent.alt_rest)
    real_value = raw_property(RawMoneyReferent.real_value)
class PhoneReferent(Referent):
    """Wrapper exposing the fields of a ``RawPhoneReferent``.

    Every attribute delegates to the same-named property of ``self.raw``.
    """

    number = raw_property(RawPhoneReferent.number)
    add_number = raw_property(RawPhoneReferent.add_number)
    country_code = raw_property(RawPhoneReferent.country_code)
    kind = raw_property(RawPhoneReferent.kind)
# Maps each supported raw referent class to its wrapper class.
# convert_referent() looks up the exact type of the raw object here and
# raises TypeError for classes that are not listed.
REFERENTS = {
    RawPersonReferent: PersonReferent,
    RawPersonPropertyReferent: PersonPropertyReferent,
    RawPersonIdentityReferent: PersonIdentityReferent,
    RawOrganizationReferent: OrganizationReferent,
    RawGeoReferent: GeoReferent,
    RawDateReferent: DateReferent,
    RawDateRangeReferent: DateRangeReferent,
    RawMoneyReferent: MoneyReferent,
    RawPhoneReferent: PhoneReferent,
}
def convert_referent(raw):
    """Wrap a raw referent in the matching ``Referent`` subclass.

    The wrapper class is chosen from ``REFERENTS`` by the exact type of
    ``raw``. The new wrapper gets ``raw.type_name`` as its label, keeps a
    reference to ``raw`` and starts with the default (empty) slots.

    Raises ``TypeError`` when the type of ``raw`` is not in ``REFERENTS``.
    """
    wrapper = REFERENTS.get(type(raw))
    if not wrapper:
        raise TypeError('not supported type: {type}'.format(
            type=type(raw)
        ))
    referent = wrapper(raw.type_name)
    referent.raw = raw
    return referent
def convert_slots(raw, referents):
    """Yield a ``Slot`` for each raw slot in ``raw``.

    ``referents`` maps ``id(raw_referent)`` to its converted wrapper. A slot
    whose value is a raw referent gets the wrapper as its value; if that
    referent was never converted the slot is skipped. Other values pass
    through unchanged.
    """
    for item in raw:
        key, value = item.type_name, item.value
        if isinstance(value, RawReferent):
            lookup = id(value)
            if lookup not in referents:
                # TODO rare
                continue
            value = referents[lookup]
        yield Slot(key, value)
def convert_referents(raws):
    """Convert raw referents and return them keyed by ``id(raw)``.

    Works in two passes over ``raws``: first every raw referent gets a
    wrapper, then the slots are filled in, so that slots can point at
    wrappers regardless of the order of ``raws``.
    """
    referents = {}
    # Pass 1: one wrapper per distinct raw object.
    for raw in raws:
        key = id(raw)
        if key in referents:
            continue
        referents[key] = convert_referent(raw)

    # Pass 2: resolve slots now that every wrapper exists.
    for raw in raws:
        converted = list(convert_slots(raw.slots, referents))
        referents[id(raw)].slots = converted
    return referents
############
#
# RESULT
#
############
from pullenti.ner.ReferentToken import ReferentToken
class Span(Record):
    """A pair of character offsets, ``start`` and ``stop``."""

    __attributes__ = ['start', 'stop']

    def __init__(self, start, stop):
        self.start, self.stop = start, stop
class Match(Record):
    """A referent found at a span of text, with nested child matches."""

    __attributes__ = ['referent', 'span', 'children']

    def __init__(self, referent, span, children):
        # Validate everything up front; TypeError on the first mismatch.
        assert_type(referent, Referent)
        assert_type(span, Span)
        for nested in children:
            assert_type(nested, Match)

        self.referent = referent
        self.span = span
        self.children = children

    def walk(self):
        """Yield this match, then all descendants depth-first."""
        yield self
        for nested in self.children:
            for descendant in nested.walk():
                yield descendant
def get_match(token, referents):
    """Build a ``Match`` for a referent token.

    The span runs from the begin character of ``token.begin_token`` to one
    past the end character of ``token.end_token``. Children are the matches
    found among the tokens between those two, inclusive.
    """
    referent = referents[id(token.referent)]
    first, last = token.begin_token, token.end_token
    span = Span(first.begin_char, last.end_char + 1)
    nested = list(get_matches(first, last, referents))
    return Match(referent, span, nested)
def get_matches(token, stop=None, referents=None):
    """Yield a ``Match`` for every ``ReferentToken`` in a token chain.

    Follows ``next0_`` links starting at ``token`` until the chain ends or
    the token equal to ``stop`` has been handled (``stop`` is inclusive).
    """
    current = token
    while current:
        if isinstance(current, ReferentToken):
            yield get_match(current, referents)
        if current == stop:
            return
        current = current.next0_
def convert_result(raw):
    """Convert a raw analysis result into a ``Result``.

    Referents are converted from ``raw.entities``, matches are collected
    starting at ``raw.first_token``, and the raw object is kept on the
    returned result as ``.raw``.
    """
    referents = convert_referents(raw.entities)
    result = Result(list(get_matches(raw.first_token, referents=referents)))
    result.raw = raw
    return result
class Result(Record):
    """Top-level matches produced for one text.

    ``raw`` is a plain class-level default of ``None``; ``convert_result``
    sets it on the instance to the underlying raw result.
    """

    __attributes__ = ['matches']

    raw = None

    def __init__(self, matches):
        self.matches = matches

    def walk(self):
        """Yield every match, each followed by its descendants."""
        for top in self.matches:
            for item in top.walk():
                yield item

    @property
    def graph(self):
        """Graph of referents and their slots built by ``graph_result``."""
        return graph_result(self)
############
#
# GRAPH
#
###########
from subprocess import Popen, PIPE
# Colour values passed to Graphviz as node/edge attributes.
BLUE = '#aec7e8'
ORANGE = '#ffbb78'
GREEN = '#dbdb8d'
RED = '#ff9896'
PURPLE = '#f7b6d2'
SILVER = '#eeeeee'
GRAY = 'gray'
DARKGRAY = '#888888'
def dot2svg(source):
    """Render DOT ``source`` text to SVG text with the ``dot`` executable.

    The source is sent UTF-8 encoded on stdin and the SVG is decoded from
    stdout as UTF-8. Raises ``ValueError`` carrying the captured stderr
    bytes when ``dot`` exits with a non-zero status.
    """
    command = ['dot', '-T', 'svg']
    process = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate(source.encode('utf8'))
    if process.returncode != 0:
        raise ValueError(stderr)
    return stdout.decode('utf8')
class style(Record):
    """A bag of Graphviz attributes rendered as ``key="value", ...``."""

    __attributes__ = ['attributes']

    def __init__(self, **attributes):
        self.attributes = attributes

    def quote(self, value):
        """Return ``str(value)`` as a double-quoted DOT string.

        Double quotes, newlines and carriage returns are replaced by their
        backslash escapes; all other characters are left untouched.
        """
        escaped = (
            str(value)
            .replace('"', r'\"')
            .replace('\n', r'\n')
            .replace('\r', r'\r')
        )
        return '"{}"'.format(escaped)

    def __str__(self):
        pairs = [
            '{key}={value}'.format(key=key, value=self.quote(value))
            for key, value in self.attributes.items()
        ]
        return ', '.join(pairs)
class Node(Record):
    """A graph node: the wrapped ``item`` and its optional ``style``."""

    __attributes__ = ['item', 'style']

    def __init__(self, item, style):
        self.item, self.style = item, style

    def __hash__(self):
        # Hash by identity of the wrapped item rather than by field values.
        return id(self.item)
class Edge(Record):
    """A directed edge from ``source`` to ``target`` with optional ``style``."""

    __attributes__ = ['source', 'target', 'style']

    def __init__(self, source, target, style):
        self.source, self.target = source, target
        self.style = style

    def __hash__(self):
        # Combine the identities of both endpoints.
        source_id, target_id = id(self.source), id(self.target)
        return source_id ^ target_id
class Graph(Record):
    """A set of nodes and edges that can be emitted as Graphviz DOT.

    Items are numbered in the order they are first seen while the DOT
    source is generated; ``_repr_svg_`` renders that source through
    ``dot2svg``.
    """

    __attributes__ = ['nodes', 'edges']

    # Defaults written once at the top of the DOT source.
    graph_style = style(
        margin=0,
        nodesep=0,
        ranksep=0,
        splines='splines',
        layout='neato',
        overlap='compress',
    )
    node_style = style(
        shape='box',
        height=0,
        width=0,
        fontname='sans',
        fontsize=10,
        color='none',
        style='filled',
        fillcolor=SILVER
    )
    edge_style = style(
        fontname='sans',
        fontsize=8,
        fontcolor=GRAY,
        arrowsize=0.3,
        color=GRAY
    )

    def __init__(self):
        self.nodes = set()
        self.edges = set()
        # id(item) -> sequential index used as the DOT node name.
        self.ids = {}

    def add_node(self, item, style=None):
        """Register ``item`` as a node with an optional ``style``."""
        self.nodes.add(Node(item, style))

    def add_edge(self, source, target, style=None):
        """Register an edge ``source -> target`` with an optional ``style``."""
        self.edges.add(Edge(source, target, style))

    def id(self, item):
        """Return the sequential index of ``item``, assigning one if new."""
        key = id(item)
        if key not in self.ids:
            self.ids[key] = len(self.ids)
        return self.ids[key]

    @property
    def source(self):
        """Generate the lines of the DOT source."""
        yield 'digraph G {'
        yield 'graph [{graph_style}];'.format(
            graph_style=str(self.graph_style)
        )
        yield 'node [{node_style}];'.format(
            node_style=str(self.node_style)
        )
        yield 'edge [{edge_style}];'.format(
            edge_style=str(self.edge_style)
        )

        for node in self.nodes:
            index = self.id(node.item)
            if node.style:
                yield '{index} [{style}];'.format(
                    index=index,
                    style=str(node.style)
                )
            else:
                yield '{index}'.format(index=index)

        for edge in self.edges:
            # Source is numbered before target; ids depend on call order.
            tail = self.id(edge.source)
            head = self.id(edge.target)
            if edge.style:
                yield '{source} -> {target} [{style}];'.format(
                    source=tail,
                    target=head,
                    style=str(edge.style)
                )
            else:
                yield '{source} -> {target};'.format(
                    source=tail,
                    target=head
                )

        yield '}'

    def _repr_svg_(self):
        lines = self.source
        return dot2svg('\n'.join(lines))
def graph_result(result):
    """Build a ``Graph`` of the referents in ``result`` and their slots.

    For every slot of every matched referent an edge labelled with the slot
    key is added from the referent to the slot value, together with a node
    for each end. Referent nodes are blue and labelled with the referent's
    label; other values are silver and labelled with the value itself.
    Nodes and edges are only added inside the slot loop.
    """
    graph = Graph()
    for match in result.walk():
        referent = match.referent
        for key, target in referent.slots:
            graph.add_edge(referent, target, style(label=key))
            graph.add_node(
                referent,
                style(label=referent.label, fillcolor=BLUE)
            )
            if isinstance(target, Referent):
                label, color = target.label, BLUE
            else:
                label, color = target, SILVER
            graph.add_node(target, style(label=label, fillcolor=color))
    return graph
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment