Skip to content

Instantly share code, notes, and snippets.

@anmsid
Created December 23, 2013 23:48
Show Gist options
  • Save anmsid/8106752 to your computer and use it in GitHub Desktop.
Save anmsid/8106752 to your computer and use it in GitHub Desktop.
Scrape member profile details from jkt48.com website and store it into json format.
#!/usr/bin/env python
import urllib2
import re
import json
"""
Scrape member profile details from the jkt48.com website and store them in JSON format.
Author: anmsid[at]gmail.com
"""
def get_list_id(url):
    """Return the member ids found on a member list page.

    Args:
        url: Address of the member list page to download.

    Returns:
        A list of id strings, one per regex match, in the order they
        appear in the page. Each id is the run of digits captured from a
        ``/member/detail/id/<digits>?`` link that follows a
        ``class="profilename">`` marker. The list is empty when nothing
        matches.
    """
    # Resolve the opener at call time so the function works with urllib2
    # (Python 2) as well as urllib.request (Python 3).
    try:
        from urllib2 import urlopen
    except ImportError:
        from urllib.request import urlopen

    response = urlopen(url)
    try:
        html = response.read()
    finally:
        # Always release the connection, even when the read fails.
        response.close()

    # On Python 3 the body arrives as bytes, but the pattern below is a str
    # pattern. On Python 2 the body is already ``str`` and is left untouched.
    if not isinstance(html, str):
        # NOTE(review): assumes the site serves UTF-8 -- confirm.
        html = html.decode('utf-8', 'replace')

    # re.S lets ".+?" span the line breaks between the marker and the link.
    p = re.compile(r'class="profilename">.+?<a href="/member/detail/id/(\d+?)\?', re.M | re.S)
    return p.findall(html)
# Pattern for one member profile page. Compiled once at import time because
# get_detail() is called once per member. The pieces are adjacent raw
# strings, so the resulting pattern is one continuous expression.
_PROFILE_RE = re.compile(
    r'<div class="photo">.+?<img src="/profile/(?P<ppic>.+?)" alt'
    r'.+?<span itemprop="name">(?P<name>.+?)</span>'
    r'.+?<time itemprop="birthday".+?>(?P<birthday>.+?)</time>'
    r'.+?<span itemprop="role">(?P<bloodtype>.+?)</span>'
    r'.+?<span itemprop="org">(?P<horoscope>.+?)</span>'
    r'.+?Tinggi Badan.+?"bioright">(?P<height>.+?)</div>'
    r'.+?"nickname">(?P<nickname>.+?)</span>',
    re.M | re.S
)


def get_detail(url):
    """Download one member detail page and extract the profile fields.

    Args:
        url: Address of the member detail page to download.

    Returns:
        A dict with the keys ``ppic``, ``name``, ``birthday``,
        ``bloodtype``, ``horoscope``, ``height`` and ``nickname``, taken
        from the first place in the page where the profile pattern matches.

    Raises:
        IndexError: If the page does not match the profile pattern. The
            message names the offending URL.
    """
    # Resolve the opener at call time so the function works with urllib2
    # (Python 2) as well as urllib.request (Python 3).
    try:
        from urllib2 import urlopen
    except ImportError:
        from urllib.request import urlopen

    response = urlopen(url)
    try:
        html = response.read()
    finally:
        # Always release the connection, even when the read fails.
        response.close()

    # On Python 3 the body arrives as bytes, but the pattern is a str
    # pattern. On Python 2 the body is already ``str`` and is left untouched.
    if not isinstance(html, str):
        # NOTE(review): assumes the site serves UTF-8 -- confirm.
        html = html.decode('utf-8', 'replace')

    # Only the first match is needed, so search() replaces collecting every
    # match into a list and indexing it.
    match = _PROFILE_RE.search(html)
    if match is None:
        # Same exception type as the previous "[...][0]" lookup, but with
        # enough context to find the page that failed.
        raise IndexError("no profile details found at %s" % url)
    return match.groupdict()
if __name__ == "__main__":
    # Script entry point: fetch the member list, scrape each member's detail
    # page, then write all profiles to parsed48.json under the "profiles" key.
    list_id = get_list_id('http://jkt48.com/member/list?lang=id')
    # A single-argument print(...) emits the same text on Python 2 and 3.
    print("There are %s url to be parsed for detailed info" % len(list_id))

    # Loop through the list of members.
    list_profiles = []
    for member_id in list_id:  # not "id", which would shadow the builtin
        url = 'http://jkt48.com/member/detail/id/%s?lang=id' % member_id
        print("Parsing %s " % url)
        list_profiles.append(get_detail(url))

    json_dict = {'profiles': list_profiles}
    print("Writing to parsed48.json...")
    # "with" closes the file even if serialisation raises part-way through.
    with open('parsed48.json', 'w') as f:
        json.dump(json_dict, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment