Skip to content

Instantly share code, notes, and snippets.

@beautyfree
Forked from kalloc/extract.py
Created November 17, 2015 00:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save beautyfree/48025600f0d43c707389 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import nltk
import re
import sys
import json
import string
from nltk.corpus import stopwords
import pymorphy2
def tokenize_me(file_text):
    """Tokenize Russian text and return the set of normalized word forms.

    The text is split into tokens with NLTK, stripped of punctuation,
    stop words and guillemet quote characters, and every remaining token
    is reduced to its normal (dictionary) form with pymorphy2.

    Args:
        file_text: raw text to tokenize.

    Returns:
        set of unique normal forms of the meaningful tokens.
    """
    tokens = nltk.word_tokenize(file_text)

    # Drop tokens that are single punctuation characters.
    tokens = [t for t in tokens if t not in string.punctuation]

    # NLTK's Russian stop-word list, extended with a few extra fillers.
    # A set makes the per-token membership test O(1) instead of O(n).
    stop_words = set(stopwords.words('russian'))
    stop_words.update(
        ['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на']
    )
    tokens = [t for t in tokens if t not in stop_words]

    # Strip guillemet quotation marks glued onto tokens.
    tokens = [t.replace("«", "").replace("»", "") for t in tokens]

    # NOTE(review): MorphAnalyzer construction is expensive; if this
    # function is called in a loop, consider caching one instance.
    morph = pymorphy2.MorphAnalyzer()

    # Collect normal forms in a set directly — the original appended to a
    # list guarded by `not in words` (O(n^2)) only to build a set anyway.
    normals = set()
    for token in tokens:
        parses = morph.parse(token)
        if not parses:
            # parse() normally always yields at least one hypothesis;
            # kept as a defensive guard, matching the original behavior.
            continue
        normals.add(parses[0].normal_form)
    return normals
if __name__ == '__main__':
    # Load the vacancy JSON named on the command line and pull out the
    # HTML description field.
    with open(sys.argv[1]) as fh:
        data = json.load(fh)
    description = data['description']

    # Each <ul> in the description is one section: the first holds the
    # "требования" (requirements), the second the "условия" (conditions).
    uls = re.findall(r'<ul>([\s\S]*?)</ul>', description, re.M)

    # BUG FIX: the original parsed uls[0] for BOTH sections, printing the
    # requirements list twice. zip also tolerates a description with fewer
    # than two <ul> blocks instead of raising IndexError.
    for title, ul in zip(("требования", "условия"), uls):
        print("\t" + title)
        for li in re.findall(r'<li>([\s\S]*?)</li>', ul, re.M):
            print(li, 'to', tokenize_me(li))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment