NetBUG/getter.py

## getter.py
#coding=utf-8
import requests
import shutil

# URL prefix for a detail page on japanese-cinema-db.jp; saveFile() appends
# the record id to it to form the full request URL.
fn = "http://www.japanese-cinema-db.jp/Details?id="

def saveFile(path, timeout=30):
    """Download one detail page and store it as ``<path>.htm`` in the cwd.

    The URL is the module-level ``fn`` prefix followed by ``str(path)``.
    The body is written to disk only when the server answers HTTP 200; for
    any other status nothing is written and no error is raised.

    Args:
        path: Record id; used as the query value and as the base name of
            the output file.
        timeout: Seconds ``requests`` may wait on the server before raising.
            Without a timeout a stalled connection blocks forever.

    Raises:
        requests.exceptions.RequestException: On connection errors or
            timeouts while issuing the request (not caught here).
    """
    # Using the response as a context manager releases the connection even
    # when the status is not 200 or the copy to disk fails.
    with requests.get(fn + str(path), stream=True, timeout=timeout) as r:
        if r.status_code == 200:
            with open(str(path) + ".htm", 'wb') as f:
                # Ask urllib3 to undo gzip/deflate content encoding so the
                # file holds plain HTML rather than compressed bytes.
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)

if __name__ == '__main__':
    import sys

    # Fetch every record id in the hard-coded range 1..46999.
    for a in range(1, 47000):
        try:
            saveFile(a)
        except requests.RequestException as exc:
            # One failed request must not abort the rest of the crawl;
            # report the id on stderr and carry on with the next one.
            print("id %d: download failed: %s" % (a, exc), file=sys.stderr)

## process.py
#coding=utf-8
import re
#pattern = re.compile("([A-Za-zア-ヺ]+)")
# Kept from the original script; nothing below uses it.
pattern = re.compile("([A-Za-z]+)")

# Compiled once: both are applied to every page in the loop below.
_TAG_RE = re.compile('<[^>]*>')
# Runs of Latin letters or katakana. The range is spelled A-Za-z because
# "A-z" would also match the punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
_FOREIGN_RE = re.compile('[A-Za-zア-ヺ]+')

# Scan each saved page "<id>.htm" and print one "date|url|title|cast" line
# for every page whose cast section contains more than one Latin/katakana run.
for a in range(1, 47000):
    try:
        # assumes the saved pages are UTF-8 — TODO confirm against the site;
        # undecodable bytes are replaced rather than aborting the run.
        fh = open(str(a) + ".htm", 'r', encoding="utf-8", errors="replace")
    except FileNotFoundError:
        # The getter only writes a file for HTTP 200 responses, so gaps in
        # the id range are expected.
        continue
    with fh:
        bOn = False        # True while inside the cast section
        bDateNext = False  # True when the next line holds the release date
        castParts = []
        sTitle = ""
        sDate = ""
        for line in fh:
            if "<title>" in line:
                sTitle += _TAG_RE.sub('', line).strip()
            if "<dt>公開年月日</dt>" in line:
                # "公開年月日" = release date; its value is on the following line.
                bDateNext = True
                continue
            if bDateNext:
                sDate = _TAG_RE.sub('', line.strip()).replace("&nbsp;", "")
                bDateNext = False
            if ">出演者</p>" in line:
                # "出演者" = cast; collect lines from here up to the next </div>.
                bOn = True
            if "</div>" in line:
                bOn = False
            if bOn:
                castParts.append(line.strip())
    # Replace tags with spaces so adjacent names stay separated.
    sOut = _TAG_RE.sub(' ', "".join(castParts))
    bForeign = len(_FOREIGN_RE.findall(sOut)) > 1
    if bForeign:
        print(sDate + "|http://www.japanese-cinema-db.jp/Details?id=" + str(a) + "|" + sTitle + "|" + re.sub(r"\s+", " ", sOut))

## run.sh
# Getting everything: download every detail page into the current directory.
python3 getter.py

# Processing: process.py prints "date|url|title|cast" lines. Keep those whose
# first field (the date) contains "201". The pattern is confined to the text
# before the first "|" because an unanchored "grep 201" also matches record
# ids in the URL field (e.g. Details?id=2015).
python3 process.py | grep '^[^|]*201' > foreign_2010.txt
	#coding=utf-8
	import requests
	import shutil

	# URL prefix for a detail page on japanese-cinema-db.jp; saveFile() appends
	# the record id to it to form the full request URL.
	fn = "http://www.japanese-cinema-db.jp/Details?id="

	def saveFile(path, timeout=30):
		"""Download one detail page and store it as ``<path>.htm`` in the cwd.

		The URL is the module-level ``fn`` prefix followed by ``str(path)``.
		The body is written to disk only when the server answers HTTP 200; for
		any other status nothing is written and no error is raised.

		Args:
			path: Record id; used as the query value and as the base name of
				the output file.
			timeout: Seconds ``requests`` may wait on the server before
				raising. Without a timeout a stalled connection blocks forever.

		Raises:
			requests.exceptions.RequestException: On connection errors or
				timeouts while issuing the request (not caught here).
		"""
		# Using the response as a context manager releases the connection even
		# when the status is not 200 or the copy to disk fails.
		with requests.get(fn + str(path), stream=True, timeout=timeout) as r:
			if r.status_code == 200:
				with open(str(path) + ".htm", 'wb') as f:
					# Ask urllib3 to undo gzip/deflate content encoding so the
					# file holds plain HTML rather than compressed bytes.
					r.raw.decode_content = True
					shutil.copyfileobj(r.raw, f)

	if __name__ == '__main__':
		import sys

		# Fetch every record id in the hard-coded range 1..46999.
		for a in range(1, 47000):
			try:
				saveFile(a)
			except requests.RequestException as exc:
				# One failed request must not abort the rest of the crawl;
				# report the id on stderr and carry on with the next one.
				print("id %d: download failed: %s" % (a, exc), file=sys.stderr)
	#coding=utf-8
	import re
	#pattern = re.compile("([A-Za-zア-ヺ]+)")
	# Kept from the original script; nothing below uses it.
	pattern = re.compile("([A-Za-z]+)")

	# Compiled once: both are applied to every page in the loop below.
	_TAG_RE = re.compile('<[^>]*>')
	# Runs of Latin letters or katakana. The range is spelled A-Za-z because
	# "A-z" would also match the punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
	_FOREIGN_RE = re.compile('[A-Za-zア-ヺ]+')

	# Scan each saved page "<id>.htm" and print one "date|url|title|cast" line
	# for every page whose cast section contains more than one Latin/katakana run.
	for a in range(1, 47000):
		try:
			# assumes the saved pages are UTF-8 — TODO confirm against the site;
			# undecodable bytes are replaced rather than aborting the run.
			fh = open(str(a) + ".htm", 'r', encoding="utf-8", errors="replace")
		except FileNotFoundError:
			# The getter only writes a file for HTTP 200 responses, so gaps in
			# the id range are expected.
			continue
		with fh:
			bOn = False        # True while inside the cast section
			bDateNext = False  # True when the next line holds the release date
			castParts = []
			sTitle = ""
			sDate = ""
			for line in fh:
				if "<title>" in line:
					sTitle += _TAG_RE.sub('', line).strip()
				if "<dt>公開年月日</dt>" in line:
					# "公開年月日" = release date; its value is on the following line.
					bDateNext = True
					continue
				if bDateNext:
					# Strip the HTML non-breaking-space entity from the date text.
					sDate = _TAG_RE.sub('', line.strip()).replace("&nbsp;", "")
					bDateNext = False
				if ">出演者</p>" in line:
					# "出演者" = cast; collect lines from here up to the next </div>.
					bOn = True
				if "</div>" in line:
					bOn = False
				if bOn:
					castParts.append(line.strip())
		# Replace tags with spaces so adjacent names stay separated.
		sOut = _TAG_RE.sub(' ', "".join(castParts))
		bForeign = len(_FOREIGN_RE.findall(sOut)) > 1
		if bForeign:
			# Plain "|" separators: "\|" is not a Python escape and would put
			# a literal backslash into every output line.
			print(sDate + "|http://www.japanese-cinema-db.jp/Details?id=" + str(a) + "|" + sTitle + "|" + re.sub(r"\s+", " ", sOut))
	# Getting everything: download every detail page into the current directory.
	python3 getter.py

	# Processing: process.py prints "date|url|title|cast" lines. Keep those whose
	# first field (the date) contains "201". The pipe must be unescaped: "\|"
	# would hand a literal "|" argument to process.py instead of piping. The
	# pattern is confined to the text before the first "|" because an unanchored
	# "grep 201" also matches record ids in the URL field (e.g. Details?id=2015).
	python3 process.py | grep '^[^|]*201' > foreign_2010.txt