Vnr/pamyatnaroda.py (secret gist), last active May 16, 2019

#!/usr/bin/env python
# coding: utf8
from __future__ import print_function
_INFO = u"""
Requires Python 2 (http://www.python.org/downloads/) and the requests library (pip install requests).
The script has two modes of operation.
1) Download every page of a document by its ID:
pamyatnaroda.py -id 113839803
2) Download by image URL:
pamyatnaroda.py -url URL [-s STARTPAGE] [-e ENDPAGE]
For example, the following command downloads the first 300 pages of file 1052-0000001-0021:
pamyatnaroda.py -url "http://cdn.pamyat-naroda.ru/imageload/Передача_023_КП097Р-С28/1052-0000001-0021/00000173.jpg" -s 1 -e 300
For viewing large maps, the Picasa program is recommended: https://picasa.google.com/
"""
import os
import locale
import argparse
from contextlib import closing
import requests
import time
import random
import codecs
import re
import sys
# By default, files are saved to the folder the script was run from.
OUT_FOLDER = u''
API = 'https://cdn.pamyat-naroda.ru/ind/pamyat/_search'
IMG_PATH_PREFIX = 'https://cdn.pamyat-naroda.ru/imageload/'
# Masquerade as IE.
headers = {u'User-Agent':
           u'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko'}
s = requests.Session()
s.headers.update(headers)
# Optional local debugging proxy (disabled by default).
proxies = {
    u'http': 'http://127.0.0.1:8888',
    u'https': 'http://127.0.0.1:8888'
}
# s.proxies = proxies
# Skip TLS certificate verification and silence the resulting urllib3 warnings.
s.verify = False
requests.packages.urllib3.disable_warnings()

def create_dir(path):
    try:
        os.makedirs(path)
    except OSError:
        # Ignore "already exists"; re-raise anything else.
        if not os.path.isdir(path):
            raise

def getcookies():
    s.cookies.clear()
    s.get(u'http://pamyat-naroda.ru/')
    print(u'Got new cookies')

def get_url(_id):
    """Find all pages by document id.

    :param _id: int
    :returns: (first_page_url, first_page_num, last_page_num)
    """
    query = {
        "query": {
            "term": {
                "document_id": _id
            }
        },
        "size": 10000
    }
    pages = []
    # First try the search endpoint, which can return every page of the document.
    try:
        r = s.post(API, json=query)
        j = r.json()
        pages = [x['_source']['image_path'].replace('\\', '/')
                 for x in j['hits']['hits']]
    except Exception:
        pass
    # Fall back to the single-document endpoint.
    try:
        r2 = s.get('https://cdn.pamyat-naroda.ru/ind/pamyat/document/%d' % _id)
        j2 = r2.json()
        pages.append(j2['_source']['image_path'].replace('\\', '/'))
    except Exception:
        pass
    if not pages:
        print(u"Can't find an image URL for this document ID")
        sys.exit(15)
    pages.sort()
    url = IMG_PATH_PREFIX + pages[0]
    startpage = int(re.findall(r'/(\d+)\.jpg', pages[0], re.IGNORECASE)[0])
    endpage = int(re.findall(r'/(\d+)\.jpg', pages[-1], re.IGNORECASE)[0])
    print(url, startpage, endpage)
    return url, startpage, endpage
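
# A sketch of the expected return shape (the values below are invented for
# illustration; a real ID resolves to a different path and page range):
#   get_url(113839803)
#   -> (u'https://cdn.pamyat-naroda.ru/imageload/.../00000001.jpg', 1, 244)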

def download_image(url, filename):
    print(u'Downloading %s' % url)
    with closing(s.get(url, stream=True,
                       headers={u'Referer': u'http://pamyat-naroda.ru/'},
                       allow_redirects=False,
                       timeout=300
                       )) as r:
        if r.status_code != 200:
            # Workaround for http://github.com/kennethreitz/requests/issues/2593
            r.raw.close()
            time.sleep(random.random() * 2)
            raise requests.exceptions.HTTPError(u'Error: got %d response code' % r.status_code)
        # Stream the image to disk in 100 KB chunks.
        with open(filename, u'wb') as out_file:
            for chunk in r.iter_content(100000):
                out_file.write(chunk)
        # Log every successfully downloaded URL next to the images.
        with codecs.open(os.path.join(os.path.dirname(filename), 'urls.log'), 'a', encoding='utf8') as f:
            f.write(url + u'\n')
    print(u'ok')
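
# Example usage (hypothetical path, for illustration only):
#   download_image(IMG_PATH_PREFIX + u'some/folder/00000001.jpg', u'00000001.jpg')
# On success the file is written and its URL is appended to urls.log; any
# non-200 response raises requests.exceptions.HTTPError.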

def pamyatnaroda(url, startpage=-1, endpage=-1, out_folder=OUT_FOLDER):
    assert startpage <= endpage
    out_folder = os.path.abspath(out_folder)
    create_dir(out_folder)
    opis = url.split(u'/')[-2]  # e.g. 143-0000001-0009
    sub_folder = os.path.join(out_folder, opis)
    create_dir(sub_folder)
    print('Output folder: %s\n' % sub_folder)
    # getcookies()
    if startpage == -1 or endpage == -1:
        # No page range given: download just the single page from the URL.
        download_image(
            url, os.path.join(sub_folder, u'-'.join(url.split(u'/')[-2:])))
        return
    filepattern = os.path.join(sub_folder, opis + u'-{:08}.jpg')
    urlpattern = u'/'.join(url.split(u'/')[:-1]) + u'/{:08}' + url[-4:]
    for i in range(startpage, endpage + 1):
        filename = filepattern.format(i)
        current_url = urlpattern.format(i)
        try:
            download_image(current_url, filename)
        # except requests.exceptions.HTTPError as e:
        #     print(e)
        #     print(u'Trying to renew cookies...')
        #     getcookies()
        #     try:
        #         download_image(current_url, filename)
        except requests.exceptions.HTTPError as e:
            print(e)
            print(u"Let's try to change the extension's case")
            try:
                # Some scans are stored as .JPG rather than .jpg; retry with the case swapped.
                download_image(current_url[:-3] + current_url[-3:].swapcase(), filename)
            except requests.exceptions.HTTPError as e:
                print(e)
                print(u'Download of page %d failed\n' % i)
                with codecs.open(os.path.join(os.path.dirname(filename), 'urls.log'), 'a', encoding='utf8') as f:
                    f.write(current_url + u' failed\n')
        except requests.exceptions.ConnectionError:
            print(u'Download failed. Cannot connect to the image server')
            sys.exit(-1)
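
# Example (mirrors the command shown in _INFO): download pages 1-300 of
# file 1052-0000001-0021 into the current directory:
#   pamyatnaroda(u'http://cdn.pamyat-naroda.ru/imageload/Передача_023_КП097Р-С28/1052-0000001-0021/00000173.jpg', 1, 300)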

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=u'Pamyat-naroda downloader',
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=_INFO)
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-id', type=int, help='Document id')
    group.add_argument('-url', type=str, help='Image url')
    parser.add_argument('-s', '--startpage', nargs='?', type=int, default=-1)
    parser.add_argument('-e', '--endpage', nargs='?', type=int, default=-1)
    parser.add_argument(
        '-d', '--dir', type=str, help='directory to save to', default=OUT_FOLDER)
    args = parser.parse_args()
    # On Python 2 argparse yields byte strings; decode them with the console encoding.
    for k, v in args.__dict__.items():
        if isinstance(v, str) and sys.version_info < (3, 0):
            args.__dict__[k] = v.decode(locale.getpreferredencoding())
    # getcookies()
    if args.id:
        args.url, args.startpage, args.endpage = get_url(args.id)
    pamyatnaroda(args.url, args.startpage, args.endpage, args.dir)