Skip to content

Instantly share code, notes, and snippets.

@belyaev-pa
Created September 9, 2019 13:07
Show Gist options
  • Save belyaev-pa/3287a384e2078bf7622cd53bd1e0dc77 to your computer and use it in GitHub Desktop.
Save belyaev-pa/3287a384e2078bf7622cd53bd1e0dc77 to your computer and use it in GitHub Desktop.
Test task by Beliaev for Polymedia company
# -*- coding: utf-8 -*-
import functools
import gzip
import os
import shutil
import time
import urllib.request
from collections import defaultdict

import lxml.etree
import matplotlib.pyplot as plt
import pandas as pd
# Local path the decompressed XML dump is written to and later parsed from.
XML_TMP = '/home/pavel/Загрузки/dtb/dblp.xml'
# Path the generated bar chart is saved to.
OUTPUT_FILE = '/home/pavel/Загрузки/dtb/myplot.png'
# URL of the gzip-compressed XML dump fetched by download_gz_and_unpack.
GZ_URL = 'http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/dblp/dblp.xml.gz'
# URL of the DTD fetched by DTDResolver while the XML is being parsed.
DTD_URL = 'http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/dblp/dblp.dtd'
def timing(f):
    """
    Decorator that prints how long each call of the wrapped function took.

    :param f: callable to be timed
    :return: wrapper that forwards positional and keyword arguments to ``f``,
        prints the elapsed time in milliseconds and returns ``f``'s result
    """
    @functools.wraps(f)  # keep the wrapped function's __name__ and docstring
    def wrap(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval is not affected
        # by system clock adjustments.
        started = time.perf_counter()
        ret = f(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - started) * 1000.0
        print('{:s} Выполнение функции заняло: {:.3f} мс'.format(
            f.__name__, elapsed_ms))
        return ret
    return wrap
def download_gz_and_unpack(xml_tmp):
    """
    Download the gzip archive from GZ_URL and decompress it into ``xml_tmp``.

    :param xml_tmp: path the decompressed XML file is written to
    :return: None
    """
    file_name, _headers = urllib.request.urlretrieve(GZ_URL)
    try:
        with gzip.open(file_name, 'rb') as gz_file, open(xml_tmp, 'wb') as out:
            # Copy in chunks so the whole decompressed dump is never held in
            # memory at once.
            shutil.copyfileobj(gz_file, out)
    finally:
        # Remove the temporary download even when decompression fails.
        os.remove(file_name)
def iterate_over_xml(xml_path):
    """
    Lazily yield every direct child element of the XML root.

    The file is parsed incrementally with DTD loading and validation enabled;
    the DTD is supplied through a DTDResolver instance.  An element is yielded
    once its "end" event has been seen, and it is cleared as soon as the
    consumer requests the next one, so callers must not keep references to
    yielded elements.

    :param xml_path: path to the XML file
    :return: generator of lxml elements
    """
    records = lxml.etree.iterparse(xml_path, events=("start", "end"),
                                   dtd_validation=True, load_dtd=True)
    records.resolvers.add(DTDResolver())
    # Consume the root's "start" event so that depth 1 below always means
    # "direct child of the root".
    next(records)
    depth = 0
    for event, element in records:
        if event == 'start':
            depth += 1
            continue
        depth -= 1
        # Track nesting depth instead of comparing tag names: a descendant
        # sharing the record's tag must not end the record early.  The root's
        # own "end" event brings depth to -1 and is skipped as well.
        if depth != 0:
            continue
        yield element
        element.clear()
        # A cleared element is still attached to the root; drop the already
        # processed siblings so memory use does not grow with the file size.
        while element.getprevious() is not None:
            del element.getparent()[0]
class DTDResolver(lxml.etree.Resolver):
    """
    Resolver that answers resolution requests with the DTD from DTD_URL.

    The requested ``system_url`` and ``public_id`` are ignored: every request
    is served from the same downloaded file.
    """

    # Local path of the downloaded DTD; None until resolve() is first called.
    _dtd_path = None

    def resolve(self, system_url, public_id, context):
        """
        Return the downloaded DTD as the resolved document.

        :param system_url: requested system URL (unused)
        :param public_id: requested public identifier (unused)
        :param context: opaque parser context passed on to resolve_filename
        :return: result of ``resolve_filename`` for the downloaded DTD file
        """
        # Download once per resolver instead of fetching the DTD (and leaving
        # another temporary file behind) on every call.
        if self._dtd_path is None:
            self._dtd_path, _headers = urllib.request.urlretrieve(DTD_URL)
        return self.resolve_filename(self._dtd_path, context)
def count_books_by_years(xml_path):
    """
    Count, per year, the records that have both a booktitle and a year.

    :param xml_path: path to the XML file
    :return: defaultdict mapping the year text to the number of such records
    """
    years = defaultdict(int)
    for record in iterate_over_xml(xml_path):
        # Map each child tag to its text; when a tag repeats, the last
        # occurrence wins.
        fields = {}
        for child in record:
            fields[child.tag] = child.text
        year_text = fields.get('year')
        # Records with a missing or empty year text are not counted.
        if 'booktitle' in fields and year_text:
            years[year_text] += 1
    return years
def plot_png(years_dict):
    """
    Print the per-year counts and save them as a bar chart to OUTPUT_FILE.

    :param years_dict: dictionary mapping a year to the number of records
    :return: None
    """
    df = pd.DataFrame([('{} г.'.format(k), v) for k, v in years_dict.items()],
                      columns=['year', 'book_count'])
    # sort_values returns a new frame; the result has to be rebound, otherwise
    # the printed table and the bars keep the dictionary's insertion order.
    df = df.sort_values(by=['year'])
    print(df)
    df.plot.bar(x='year', y='book_count', figsize=(9, 6), logy=True,
                yticks=[100, 1000, 10000, 100000])
    plt.savefig(OUTPUT_FILE)
@timing
def main():
    """Download and unpack the dump, count records per year, draw the chart."""
    download_gz_and_unpack(XML_TMP)
    years = count_books_by_years(XML_TMP)
    plot_png(years)


# Run the pipeline only when the file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment