Skip to content

Instantly share code, notes, and snippets.

@NetBUG
Last active April 12, 2017 15:00
Show Gist options
  • Save NetBUG/05438715374b508e8b7281c56d4a2130 to your computer and use it in GitHub Desktop.
Save NetBUG/05438715374b508e8b7281c56d4a2130 to your computer and use it in GitHub Desktop.
Japanese film DB grabber
#coding=utf-8
import requests
import shutil
# Base URL of a film's details page; saveFile() appends the film id to it.
fn = "http://www.japanese-cinema-db.jp/Details?id="
def saveFile(path):
    """Download one film details page and store it as ``<path>.htm``.

    The page URL is ``fn + str(path)``. The body is written, byte for byte
    after transport decoding, to ``<path>.htm`` in the current working
    directory. Nothing is written when the server answers with a status
    other than 200.

    Args:
        path: Film id; converted with ``str()`` for both the URL and the
            output file name.

    Raises:
        requests.RequestException: On connection errors or when the server
            does not respond within the timeout.
        OSError: If the output file cannot be written.
    """
    # stream=True keeps the connection checked out until the response is
    # closed, so use the response as a context manager: non-200 replies
    # (whose body is never read) are released as well.
    # The timeout stops a single stalled reply from hanging the whole crawl.
    with requests.get(fn + str(path), stream=True, timeout=30) as r:
        if r.status_code == 200:
            with open(str(path) + ".htm", 'wb') as f:
                # Have urllib3 undo gzip/deflate transfer encoding so the
                # saved file contains plain HTML.
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
if __name__ == '__main__':
    # Fetch the details page of every candidate film id, 1 through 46999.
    for film_id in range(1, 47000):
        saveFile(film_id)
#coding=utf-8
# Scan the downloaded "<id>.htm" film pages and print one "|"-separated
# record (release date, details URL, title, cast) for every film whose cast
# list looks like it contains non-Japanese names.
import re

DETAILS_URL = "http://www.japanese-cinema-db.jp/Details?id="
FIRST_ID = 1
LAST_ID = 47000  # exclusive upper bound, as in range()

TAG_RE = re.compile(r"<[^>]*>")
WHITESPACE_RE = re.compile(r"\s+")
# Runs of Latin letters or katakana. The range is spelled A-Za-z on purpose:
# "A-z" would also match the punctuation between "Z" and "a" ([ \ ] ^ _ `).
FOREIGN_RE = re.compile(u"[A-Za-zア-ヺ]+")

RELEASE_DATE_MARKER = "<dt>公開年月日</dt>"  # "release date" label
CAST_MARKER = ">出演者</p>"  # "cast" section heading


def parse_page(lines):
    """Extract release date, title and cast text from one details page.

    Args:
        lines: Iterable of text lines of the page (for example an open file).

    Returns:
        A ``(date, title, cast)`` tuple of strings:

        * ``date`` - the line following the release-date label, with tags
          and ``&nbsp;`` removed; ``""`` if the label or that line is absent.
        * ``title`` - tag-stripped text of every line containing ``<title>``.
        * ``cast`` - the lines from the cast heading up to (not including)
          the next line containing ``</div>``, concatenated, with every tag
          replaced by a single space.
    """
    title_parts = []
    cast_parts = []
    date = ""
    date_pending = False
    in_cast = False
    for line in lines:
        if "<title>" in line:
            title_parts.append(TAG_RE.sub("", line).strip())
        if RELEASE_DATE_MARKER in line:
            # The value is on the line after the label.
            date_pending = True
            continue
        if date_pending:
            date = TAG_RE.sub("", line.strip()).replace("&nbsp;", "")
            date_pending = False
        if CAST_MARKER in line:
            in_cast = True
        if "</div>" in line:
            in_cast = False
        if in_cast:
            cast_parts.append(line.strip())
    cast = TAG_RE.sub(" ", "".join(cast_parts))
    return date, "".join(title_parts), cast


def is_foreign(cast):
    """Return True if *cast* has more than one Latin/katakana letter run.

    This is a heuristic: a single run is tolerated, two or more flag the
    film as having foreign cast members.
    """
    # NOTE(review): HTML entities are not decoded, so an entity name such as
    # "&nbsp;" inside the cast text would count as a Latin run - confirm
    # whether the cast section can contain entities.
    return len(FOREIGN_RE.findall(cast)) > 1


def format_record(film_id, date, title, cast):
    """Build the ``date|url|title|cast`` output line for one film."""
    return (date + "|" + DETAILS_URL + str(film_id) + "|" + title + "|"
            + WHITESPACE_RE.sub(" ", cast))


def main(film_ids=None):
    """Print a record for every saved page whose cast looks foreign.

    Args:
        film_ids: Iterable of film ids to process; defaults to
            ``range(FIRST_ID, LAST_ID)``.
    """
    if film_ids is None:
        film_ids = range(FIRST_ID, LAST_ID)
    for film_id in film_ids:
        try:
            # Assumes the saved pages are UTF-8 (TODO confirm); undecodable
            # bytes are replaced rather than aborting the whole run.
            with open(str(film_id) + ".htm", "r", encoding="utf-8",
                      errors="replace") as fh:
                date, title, cast = parse_page(fh)
        except FileNotFoundError:
            # The downloader only writes a file for HTTP 200 responses, so
            # gaps in the id sequence are expected.
            continue
        if is_foreign(cast):
            print(format_record(film_id, date, title, cast))


if __name__ == '__main__':
    main()
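# Step 1: download every film details page (writes <id>.htm files into the current directory)
python3 getter.py
# Step 2: extract films with foreign-looking cast, keeping only records that contain "201" (intended: 2010s releases)
python3 process.py | grep 201 > foreign_2010.txt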
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment