Skip to content

Instantly share code, notes, and snippets.

@dtynn
Created June 1, 2014 14:35
Show Gist options
  • Save dtynn/2b7e6269eb34f9ab43b9 to your computer and use it in GitHub Desktop.
Save dtynn/2b7e6269eb34f9ab43b9 to your computer and use it in GitHub Desktop.
#coding=utf-8
from bs4 import BeautifulSoup
import urllib2
# Fetch the kanjian.com home page, extract the entries of the
# "recommendArtist" section and print them as a list of dicts with the keys
# "uid", "avatarUrl", "name" and "styles".
request_header = dict()
request_header['User-Agent'] = "kanjianbox spider"
url = "http://www.kanjian.com/"
request = urllib2.Request(url, None, request_header)
req = urllib2.urlopen(request)
try:
    doc = req.read()
finally:
    # Release the HTTP connection even when reading the body fails.
    req.close()

soup = BeautifulSoup(doc, "lxml")
recommend_list = soup.select("div#recommendArtist .bd ul li.item")

result = []
for item in recommend_list:
    user = dict()

    avatar_tag = item.select("a.userAvatar")[0]
    page_url = avatar_tag["href"]
    # The user id is taken from the last path segment of the avatar link,
    # ignoring any trailing slash; int() raises ValueError if it is not
    # numeric.
    uid = page_url.rstrip("/").split("/")[-1]
    user["uid"] = int(uid)

    # The <img> inside the avatar link supplies both the picture URL (src)
    # and the display name (alt).
    avatar_img = avatar_tag.select("img")[0]
    user["avatarUrl"] = avatar_img["src"]
    user["name"] = avatar_img["alt"]

    # Every link under p.style contributes one label; labels are joined
    # with "/" into a single string.
    style_a_tags = item.select("p.style a")
    styles = [x.text for x in style_a_tags]
    user["styles"] = "/".join(styles)

    result.append(user)

# Parenthesised form prints the same text as the Python 2 print statement
# for a single argument and also parses under Python 3.
print(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment