kalloc/extract.py

## extract.py
#!/usr/bin/env python3
import nltk
import re
import sys
import json
import string
from nltk.corpus import stopwords
import pymorphy2


# Extra tokens filtered out in addition to NLTK's Russian stop-word list.
_EXTRA_STOP_WORDS = (
    'что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на',
)

# Lazily filled cache holding the stop-word set and the morphological
# analyzer, so repeated tokenize_me() calls do not rebuild them.
_TOKENIZER_RESOURCES = {}


def _get_tokenizer_resources():
    """Return ``(stop_words, morph)``, building both on first use only."""
    if not _TOKENIZER_RESOURCES:
        # Build into locals first so a failure leaves the cache empty
        # instead of half-initialised.
        stop_words = frozenset(stopwords.words('russian')).union(
            _EXTRA_STOP_WORDS
        )
        morph = pymorphy2.MorphAnalyzer()
        _TOKENIZER_RESOURCES['stop_words'] = stop_words
        _TOKENIZER_RESOURCES['morph'] = morph
    return _TOKENIZER_RESOURCES['stop_words'], _TOKENIZER_RESOURCES['morph']


def tokenize_me(file_text):
    """Return the set of normal forms of the meaningful words in a text.

    The text is split with ``nltk.word_tokenize``; punctuation tokens,
    guillemets and stop words are discarded, and every remaining token is
    replaced by the ``normal_form`` of its first pymorphy2 parse.

    Args:
        file_text: The text to analyse.

    Returns:
        A ``set`` of normal-form strings; empty when nothing is left after
        filtering.
    """
    stop_words, morph = _get_tokenizer_resources()

    normal_forms = set()
    for token in nltk.word_tokenize(file_text):
        # Substring test against string.punctuation, kept as in the
        # original filter: drops single punctuation characters.
        if token in string.punctuation:
            continue
        # Strip guillemets *before* the stop-word check, and skip tokens
        # that consisted only of guillemets, so no empty string is ever
        # sent to the analyzer or returned.
        token = token.replace("«", "").replace("»", "")
        if not token or token in stop_words:
            continue
        parses = morph.parse(token)
        if parses:
            normal_forms.add(parses[0].normal_form)
    return normal_forms


def _print_list_items(title, list_html):
    """Print ``title``, then each ``<li>`` body of ``list_html`` with its tokens."""
    print(title)
    for item in re.findall(r'<li>([\s\S]*?)</li>', list_html):
        print(item, 'to', tokenize_me(item))


if __name__ == '__main__':
    # Usage: extract.py FILE.json
    # FILE.json must contain a "description" key holding an HTML string.
    if len(sys.argv) < 2:
        sys.exit('usage: extract.py FILE.json')

    # Close the file deterministically and do not depend on the locale
    # encoding when reading the JSON document.
    with open(sys.argv[1], encoding='utf-8') as source:
        data = json.load(source)
    description = data['description']
    uls = re.findall(r'<ul>([\s\S]*?)</ul>', description)

    # The original read uls[0] for both sections, printing the first list
    # twice. NOTE(review): assumes the first <ul> holds the requirements
    # and the second the conditions - confirm against real input.
    for index, title in enumerate(("\tтребования", "\tусловия")):
        # A missing list prints only its header instead of raising IndexError.
        list_html = uls[index] if index < len(uls) else ''
        _print_list_items(title, list_html)
	#!/usr/bin/env python3
	import nltk
	import re
	import sys
	import json
	import string
	from nltk.corpus import stopwords
	import pymorphy2


	# Extra tokens filtered out in addition to NLTK's Russian stop-word list.
	_EXTRA_STOP_WORDS = (
	    'что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на',
	)

	# Lazily filled cache holding the stop-word set and the morphological
	# analyzer, so repeated tokenize_me() calls do not rebuild them.
	_TOKENIZER_RESOURCES = {}


	def _get_tokenizer_resources():
	    """Return ``(stop_words, morph)``, building both on first use only."""
	    if not _TOKENIZER_RESOURCES:
	        # Build into locals first so a failure leaves the cache empty
	        # instead of half-initialised.
	        stop_words = frozenset(stopwords.words('russian')).union(
	            _EXTRA_STOP_WORDS
	        )
	        morph = pymorphy2.MorphAnalyzer()
	        _TOKENIZER_RESOURCES['stop_words'] = stop_words
	        _TOKENIZER_RESOURCES['morph'] = morph
	    return _TOKENIZER_RESOURCES['stop_words'], _TOKENIZER_RESOURCES['morph']


	def tokenize_me(file_text):
	    """Return the set of normal forms of the meaningful words in a text.

	    The text is split with ``nltk.word_tokenize``; punctuation tokens,
	    guillemets and stop words are discarded, and every remaining token is
	    replaced by the ``normal_form`` of its first pymorphy2 parse.

	    Args:
	        file_text: The text to analyse.

	    Returns:
	        A ``set`` of normal-form strings; empty when nothing is left after
	        filtering.
	    """
	    stop_words, morph = _get_tokenizer_resources()

	    normal_forms = set()
	    for token in nltk.word_tokenize(file_text):
	        # Substring test against string.punctuation, kept as in the
	        # original filter: drops single punctuation characters.
	        if token in string.punctuation:
	            continue
	        # Strip guillemets *before* the stop-word check, and skip tokens
	        # that consisted only of guillemets, so no empty string is ever
	        # sent to the analyzer or returned.
	        token = token.replace("«", "").replace("»", "")
	        if not token or token in stop_words:
	            continue
	        parses = morph.parse(token)
	        if parses:
	            normal_forms.add(parses[0].normal_form)
	    return normal_forms


	def _print_list_items(title, list_html):
	    """Print ``title``, then each ``<li>`` body of ``list_html`` with its tokens."""
	    print(title)
	    for item in re.findall(r'<li>([\s\S]*?)</li>', list_html):
	        print(item, 'to', tokenize_me(item))


	if __name__ == '__main__':
	    # Usage: extract.py FILE.json
	    # FILE.json must contain a "description" key holding an HTML string.
	    if len(sys.argv) < 2:
	        sys.exit('usage: extract.py FILE.json')

	    # Close the file deterministically and do not depend on the locale
	    # encoding when reading the JSON document.
	    with open(sys.argv[1], encoding='utf-8') as source:
	        data = json.load(source)
	    description = data['description']
	    uls = re.findall(r'<ul>([\s\S]*?)</ul>', description)

	    # The original read uls[0] for both sections, printing the first list
	    # twice. NOTE(review): assumes the first <ul> holds the requirements
	    # and the second the conditions - confirm against real input.
	    for index, title in enumerate(("\tтребования", "\tусловия")):
	        # A missing list prints only its header instead of raising IndexError.
	        list_html = uls[index] if index < len(uls) else ''
	        _print_list_items(title, list_html)