# anmsid/scrap48.py

## scrap48.py
#!/usr/bin/env python
import contextlib
import json
import re

try:
	import urllib2
except ImportError:  # Python 3: urlopen lives in urllib.request
	import urllib.request as urllib2

"""
Scrape member profile details from the jkt48.com website and store them in JSON format.
Author: anmsid[at]gmail.com
"""


def get_list_id(url):
	"""Return the member ids linked from the member list page at *url*.

	The page is fetched, decoded to text and searched for every
	``/member/detail/id/<digits>?`` link that follows a
	``class="profilename">`` marker.

	Args:
		url: Address of the member list page.

	Returns:
		A list of id strings (digits only) in the order they appear on the
		page; an empty list when no link matches.
	"""
	# closing() releases the connection even if read() raises; Python 2
	# response objects are not context managers on their own.
	with contextlib.closing(urllib2.urlopen(url)) as response:
		# Decode once at the I/O boundary so the text pattern below also
		# works where read() returns bytes (Python 3).
		# NOTE(review): assumes the site serves UTF-8 -- confirm.
		html = response.read().decode('utf-8', 'replace')

	# re.S lets ".+?" span the line breaks between the marker and the link.
	# re.M is not needed: the pattern contains no ^ or $ anchors.
	p = re.compile(r'class="profilename">.+?<a href="/member/detail/id/(\d+?)\?', re.S)
	return p.findall(html)

# Profile page pattern, one fragment per captured field; the adjacent raw
# strings concatenate into a single expression.  re.S lets ".+?" cross the
# line breaks between the HTML elements.
_PROFILE_PATTERN = re.compile(
	r'<div class="photo">.+?<img src="/profile/(?P<ppic>.+?)" alt'
	r'.+?<span itemprop="name">(?P<name>.+?)</span>'
	r'.+?<time itemprop="birthday".+?>(?P<birthday>.+?)</time>'
	r'.+?<span itemprop="role">(?P<bloodtype>.+?)</span>'
	r'.+?<span itemprop="org">(?P<horoscope>.+?)</span>'
	r'.+?Tinggi Badan.+?"bioright">(?P<height>.+?)</div>'
	r'.+?"nickname">(?P<nickname>.+?)</span>',
	re.S)


def get_detail(url):
	"""Return the profile fields scraped from the member detail page at *url*.

	Args:
		url: Address of one member's detail page.

	Returns:
		A dict with the keys ``ppic``, ``name``, ``birthday``, ``bloodtype``,
		``horoscope``, ``height`` and ``nickname``, taken from the first
		profile block found on the page.

	Raises:
		IndexError: If the page does not contain a profile block matching
			the expected markup.
	"""
	# closing() releases the connection even if read() raises; Python 2
	# response objects are not context managers on their own.
	with contextlib.closing(urllib2.urlopen(url)) as response:
		# Decode once at the I/O boundary so the text pattern also works
		# where read() returns bytes (Python 3).
		# NOTE(review): assumes the site serves UTF-8 -- confirm.
		html = response.read().decode('utf-8', 'replace')

	# Only the first profile is wanted, so search() replaces building a list
	# of every match and indexing it.
	match = _PROFILE_PATTERN.search(html)
	if match is None:
		# Same exception type the former "[...][0]" lookup raised, but the
		# message now says which page failed.
		raise IndexError('no profile detail found at %s' % url)
	return match.groupdict()

if __name__ == "__main__":
	# Entry point: collect every member id from the list page, scrape each
	# member's detail page, and save all profiles to parsed48.json.
	list_id = get_list_id('http://jkt48.com/member/list?lang=id')
	# Single-argument print(...) prints the same text on Python 2 and 3.
	print("There are %s url to be parsed for detailed info" % len(list_id))

	list_profiles = []
	# Named "member_id" rather than "id", which would shadow the builtin.
	for member_id in list_id:
		url = 'http://jkt48.com/member/detail/id/%s?lang=id' % member_id
		print("Parsing %s " % url)
		list_profiles.append(get_detail(url))

	print("Writing to parsed48.json...")
	# The with-block closes the file even if json.dump raises.
	with open('parsed48.json', 'w') as f:
		json.dump({'profiles': list_profiles}, f)
	#!/usr/bin/env python
	import urllib2
	import re
	import json

	"""
	Scrape member profile details from the jkt48.com website and store them in JSON format.
	Author: anmsid[at]gmail.com
	"""



	def get_list_id(url):
		"""Return the member ids linked from the member list page at *url*.

		The page is fetched, decoded to text and searched for every
		``/member/detail/id/<digits>?`` link that follows a
		``class="profilename">`` marker.

		NOTE(review): this is a second copy of the get_list_id defined
		earlier in this file -- confirm whether both should remain.

		Args:
			url: Address of the member list page.

		Returns:
			A list of id strings (digits only) in the order they appear on
			the page; an empty list when no link matches.
		"""
		# closing() releases the connection even if read() raises; Python 2
		# response objects are not context managers on their own.
		with contextlib.closing(urllib2.urlopen(url)) as response:
			# Decode once at the I/O boundary so the text pattern below also
			# works where read() returns bytes (Python 3).
			# NOTE(review): assumes the site serves UTF-8 -- confirm.
			html = response.read().decode('utf-8', 'replace')

		# re.S lets ".+?" span the line breaks between the marker and the
		# link.  re.M is not needed: the pattern has no ^ or $ anchors.
		p = re.compile(r'class="profilename">.+?<a href="/member/detail/id/(\d+?)\?', re.S)
		return p.findall(html)

	# Profile page pattern, one fragment per captured field; the adjacent raw
	# strings concatenate into a single expression.  re.S lets ".+?" cross
	# the line breaks between the HTML elements.
	_PROFILE_PATTERN = re.compile(
		r'<div class="photo">.+?<img src="/profile/(?P<ppic>.+?)" alt'
		r'.+?<span itemprop="name">(?P<name>.+?)</span>'
		r'.+?<time itemprop="birthday".+?>(?P<birthday>.+?)</time>'
		r'.+?<span itemprop="role">(?P<bloodtype>.+?)</span>'
		r'.+?<span itemprop="org">(?P<horoscope>.+?)</span>'
		r'.+?Tinggi Badan.+?"bioright">(?P<height>.+?)</div>'
		r'.+?"nickname">(?P<nickname>.+?)</span>',
		re.S)

	def get_detail(url):
		"""Return the profile fields scraped from the member detail page at *url*.

		NOTE(review): this is a second copy of the get_detail defined
		earlier in this file -- confirm whether both should remain.

		Args:
			url: Address of one member's detail page.

		Returns:
			A dict with the keys ``ppic``, ``name``, ``birthday``,
			``bloodtype``, ``horoscope``, ``height`` and ``nickname``, taken
			from the first profile block found on the page.

		Raises:
			IndexError: If the page does not contain a profile block
				matching the expected markup.
		"""
		# closing() releases the connection even if read() raises; Python 2
		# response objects are not context managers on their own.
		with contextlib.closing(urllib2.urlopen(url)) as response:
			# Decode once at the I/O boundary so the text pattern also works
			# where read() returns bytes (Python 3).
			# NOTE(review): assumes the site serves UTF-8 -- confirm.
			html = response.read().decode('utf-8', 'replace')

		# Only the first profile is wanted, so search() replaces building a
		# list of every match and indexing it.
		match = _PROFILE_PATTERN.search(html)
		if match is None:
			# IndexError is what an empty "[...][0]" lookup raises; the
			# message says which page failed.
			raise IndexError('no profile detail found at %s' % url)
		return match.groupdict()

	if __name__ == "__main__":
		# Entry point: collect every member id from the list page, scrape
		# each member's detail page, and save all profiles to parsed48.json.
		# NOTE(review): this repeats the script body that appears earlier in
		# this file; with both present the scrape runs twice -- confirm
		# whether this copy should remain.
		list_id = get_list_id('http://jkt48.com/member/list?lang=id')
		# Single-argument print(...) prints the same text on Python 2 and 3.
		print("There are %s url to be parsed for detailed info" % len(list_id))

		list_profiles = []
		# Named "member_id" rather than "id", which would shadow the builtin.
		for member_id in list_id:
			url = 'http://jkt48.com/member/detail/id/%s?lang=id' % member_id
			print("Parsing %s " % url)
			list_profiles.append(get_detail(url))

		print("Writing to parsed48.json...")
		# The with-block closes the file even if json.dump raises.
		with open('parsed48.json', 'w') as f:
			json.dump({'profiles': list_profiles}, f)