Created
April 7, 2016 14:43
-
-
Save hletrd/90f9719188bb08954c0c8472897dd816 to your computer and use it in GitHub Desktop.
Kyohaksa textbook fetcher
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""Kyohaksa textbook fetcher (Python 2 script).

For every subject index from 418 up to (but not including) 1000, request the
microsite board list page on www.kyohak.co.kr, collect the attachments
referenced through ``JavaScript:FileDown('<id>','<file name>')`` links and
save each one under ``downloaded/<index>_<title>/``.

The crawl starts as soon as the script is executed.
"""
import httplib
import re
import urllib
import os

_HOST = 'www.kyohak.co.kr'
_LIST_PATH = '/textbook/microsite/microsite_board_list.asp?tch_div=1&sub_det_idx=%03d'
_DOWNLOAD_PATH = '/textbook/inc/board/File_Down.asp?Data_Idx='
_OUTPUT_ROOT = 'downloaded'
_FIRST_INDEX = 418
_END_INDEX = 1000  # exclusive upper bound

# Compiled once instead of on every loop iteration; raw strings keep the
# backslash escapes unambiguous.
_FILE_LINK_RE = re.compile(r"JavaScript:FileDown\('([a-zA-Z0-9]+)','([^']+)'\)")
_TITLE_RE = re.compile(r' title="([^"]+)"')


def _http_get(path):
    """Return the raw response body of a GET request for *path* on the site.

    The connection is always closed, even when the request or read fails.
    """
    conn = httplib.HTTPConnection(_HOST)
    try:
        conn.request('GET', path)
        return conn.getresponse().read()
    finally:
        conn.close()


def _local_name(file_name):
    """Return *file_name* with path separators replaced by commas.

    The name comes from the remote page, so separators are neutralised to
    keep the download inside its target folder.
    """
    return file_name.replace('/', ',').replace('\\', ',')


def _target_folder(index, page):
    """Return the output folder for subject *index*, derived from *page*.

    The folder name is ``<index>_<title>`` where the title is the first
    ``title="..."`` attribute on the page; when the page has no such
    attribute the folder is just ``<index>``.
    """
    titles = _TITLE_RE.findall(page)
    if not titles:
        return os.path.join(_OUTPUT_ROOT, str(index))
    title = titles[0].replace(u'즐겨찾기 ', '').replace('/', ',')
    return os.path.join(_OUTPUT_ROOT, str(index) + '_' + title)


def _fetch_subject(index):
    """Download every attachment listed on the board page of subject *index*.

    Pages without any ``FileDown`` link are skipped without creating a folder.
    """
    page = _http_get(_LIST_PATH % index).decode('euc-kr')
    attachments = _FILE_LINK_RE.findall(page)
    print(index)
    if not attachments:
        return

    folder = _target_folder(index, page)
    # makedirs also creates the 'downloaded' root; the isdir check lets the
    # script be re-run without failing on folders from a previous run.
    if not os.path.isdir(folder):
        os.makedirs(folder)

    for data_idx, file_name in attachments:
        # The file name is sent back to the server in the page's own encoding.
        query = (_DOWNLOAD_PATH + data_idx + '&File_Name=' +
                 urllib.quote_plus(file_name.encode('euc-kr')))
        print(file_name)
        # Read the body before opening the file so a failed request does not
        # leave an empty file behind.
        payload = _http_get(query)
        with open(os.path.join(folder, _local_name(file_name)), 'wb') as output:
            output.write(payload)


for i in range(_FIRST_INDEX, _END_INDEX):
    _fetch_subject(i)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment