Last active
April 12, 2017 15:00
-
-
Save NetBUG/05438715374b508e8b7281c56d4a2130 to your computer and use it in GitHub Desktop.
Japanese film DB grabber
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8 | |
import requests | |
import shutil | |
fn = "http://www.japanese-cinema-db.jp/Details?id=" | |
def saveFile(path, timeout=30):
    """Download one film detail page and store it as ``<path>.htm``.

    The URL is built by appending ``str(path)`` to the module-level ``fn``
    prefix. The body is streamed straight to disk in the current working
    directory. Nothing is written when the server answers with a status
    other than 200.

    Args:
        path: Film id (anything ``str()`` can render); used both in the URL
            and as the output file's base name.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking the crawl forever.

    Raises:
        requests.RequestException: On network errors or timeouts.
        OSError: If the output file cannot be written.
    """
    r = requests.get(fn + str(path), stream=True, timeout=timeout)
    try:
        if r.status_code != 200:
            return
        # Let urllib3 undo gzip/deflate transfer encoding so the saved file
        # holds the decoded page rather than the compressed stream.
        r.raw.decode_content = True
        with open(str(path) + ".htm", 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    finally:
        # A streamed response keeps its connection checked out until it is
        # closed; release it on every path, including non-200 answers.
        r.close()
if __name__ == '__main__':
    # Script entry point: fetch the detail page for every candidate film id
    # from 1 up to and including 46999.
    for film_id in range(1, 47000):
        saveFile(film_id)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8 | |
import re | |
#pattern = re.compile("([A-Za-zア-ヺ]+)") | |
pattern = re.compile("([A-Za-z]+)") | |
# Regexes are compiled once instead of on every file.
TAG_RE = re.compile(r'<[^>]*>')
WHITESPACE_RE = re.compile(r"\s+")
# Runs of Latin letters or katakana. The range is spelled A-Za-z on purpose:
# "A-z" would also match the punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
FOREIGN_RE = re.compile(u'[A-Za-zア-ヺ]+')

DETAILS_URL = "http://www.japanese-cinema-db.jp/Details?id="


def parse_film_page(lines):
    """Extract title, release date and cast text from one saved page.

    Args:
        lines: Iterable of text lines (for example an open text file).

    Returns:
        A ``(title, date, cast)`` tuple of strings:
        * title - tag-stripped text of every line containing ``<title>``;
        * date - tag-stripped, space-free text of the line that follows the
          release-date label, or ``""`` if no such line exists;
        * cast - the lines from the cast heading up to (not including) the
          next line containing ``</div>``, with each tag replaced by a space.
    """
    title_parts = []
    cast_parts = []
    date = ""
    date_pending = False  # True right after the release-date label line
    in_cast = False
    for line in lines:
        if "<title>" in line:
            title_parts.append(TAG_RE.sub('', line).strip())
        if "<dt>公開年月日</dt>" in line:
            # The value sits on the following line; skip the label itself.
            date_pending = True
            continue
        if date_pending:
            date = TAG_RE.sub('', line.strip()).replace(" ", "")
            date_pending = False
        if ">出演者</p>" in line:
            in_cast = True
        if "</div>" in line:
            in_cast = False
        if in_cast:
            cast_parts.append(line.strip())
    cast = TAG_RE.sub(' ', "".join(cast_parts))
    return "".join(title_parts), date, cast


def has_foreign_names(cast):
    """Return True when *cast* holds more than one Latin/katakana run."""
    return len(FOREIGN_RE.findall(cast)) > 1


def format_record(film_id, title, date, cast):
    """Build the pipe-separated output line: date|url|title|cast."""
    return (date + "|" + DETAILS_URL + str(film_id) + "|" + title + "|"
            + WHITESPACE_RE.sub(" ", cast))


def main():
    """Scan ``1.htm`` .. ``46999.htm`` and print films with foreign cast."""
    for film_id in range(1, 47000):
        try:
            # NOTE(review): assumes the saved pages are UTF-8 - confirm
            # against the site's declared charset. Undecodable bytes are
            # replaced so one bad page cannot abort the whole run.
            with open(str(film_id) + ".htm", 'r', encoding="utf-8",
                      errors="replace") as fh:
                title, date, cast = parse_film_page(fh)
        except FileNotFoundError:
            # The downloader only writes a file for HTTP 200 answers, so
            # gaps in the id sequence are expected.
            continue
        if has_foreign_names(cast):
            print(format_record(film_id, title, date, cast))


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Step 1: download every film detail page into the current directory as <id>.htm
python3 getter.py
# Step 2: print films whose cast contains foreign-looking names and keep only
# the output lines containing "201".
# NOTE(review): the grep pattern is unanchored, so it also matches "201" inside
# ids, URLs or names; presumably it is meant to select 2010s release dates -
# confirm the date format before anchoring it (e.g. '^201').
python3 process.py | grep 201 > foreign_2010.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment