Skip to content

Instantly share code, notes, and snippets.

@bryanyang0528
Last active August 29, 2015 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save bryanyang0528/8bf8a31e6ef67c3118fa to your computer and use it in GitHub Desktop.
Save bryanyang0528/8bf8a31e6ef67c3118fa to your computer and use it in GitHub Desktop.
Blog Crawler
{
"metadata": {
"name": "",
"signature": "sha256:d0e70bba76e533c765aa57c9511a49d143a281d7ab469d94c5c12b1f34eb3f77"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Import \u6240\u9700\u8981\u7684\u5957\u4ef6"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import requests\n",
"from BeautifulSoup import BeautifulSoup\n",
"import HTMLParser\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "raw",
"metadata": {},
"source": []
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"\u7372\u53d6\u7db2\u9801\u8cc7\u8a0a"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Download the blog front page. The timeout keeps a stalled connection from\n",
"# hanging the kernel, and raise_for_status() stops the notebook on an HTTP\n",
"# error instead of letting later cells parse an error page.\n",
"res = requests.get(\"http://bryannotes.blogspot.tw/\", timeout=30)\n",
"res.raise_for_status()\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"\u8f49\u6210SOUP\u7269\u4ef6"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Re-encode the decoded response text as UTF-8 bytes, then parse it into a soup tree.\n",
"page_bytes = res.text.encode(\"utf-8\")\n",
"soup = BeautifulSoup(page_bytes)\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"\u7528\u95dc\u9375TAG\u627e\u9023\u7d50(\u5148\u7528\u4e00\u7b46\u8cc7\u6599\u6e2c\u8a66)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Collect every h3 heading whose class is 'post-title entry-title'.\n",
"title_attrs = {'class': 'post-title entry-title'}\n",
"bid_table = soup.findAll('h3', title_attrs)\n",
"\n",
"# Spot-check the second heading: show the anchors in it that carry an href.\n",
"sample_heading = bid_table[1]\n",
"print sample_heading.findAll('a', {'href': True})"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[<a href=\"http://bryannotes.blogspot.tw/2014/06/python.html\">[Python] \u57fa\u672c\u8a9e\u6cd5\u4ecb\u7d39&#12289;\u6559\u5b78\u8207\u7c21\u55ae\u7bc4\u4f8b</a>]\n"
]
}
],
"prompt_number": 13
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"\u6293\u9023\u7d50"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Write each post URL on its own line so the crawl cell can read them back.\n",
"# The href is taken straight from the tag rather than sliced out of the repr\n",
"# of a list, so headings with no link or several links, and non-ASCII URLs,\n",
"# are handled. The with-block closes the file even if a write fails.\n",
"with open(\"blog_links.txt\", 'w') as bid_file:\n",
"    for link in bid_table:\n",
"        for tag in link.findAll('a', {'href': True}):\n",
"            links = tag['href']\n",
"            bid_file.write(links.encode('utf-8') + \"\\n\")\n",
"            print links\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"http://bryannotes.blogspot.tw/2014/06/python-list-dictionary.html\n",
"http://bryannotes.blogspot.tw/2014/06/python.html\n",
"http://bryannotes.blogspot.tw/2014/06/python-python.html\n",
"http://bryannotes.blogspot.tw/2014/06/python-crawler-blog.html\n",
"http://bryannotes.blogspot.tw/2014/06/rreshapetranspose.html\n",
"http://bryannotes.blogspot.tw/2014/06/data-six-ways-to-make-your-data-more.html\n",
"http://bryannotes.blogspot.tw/2014/06/r-applysapplylapply.html\n",
"http://bryannotes.blogspot.tw/2014/06/30.html\n",
"http://bryannotes.blogspot.tw/2014/06/blog-post_11.html\n",
"http://bryannotes.blogspot.tw/2014/06/blog-post_5007.html\n",
"http://bryannotes.blogspot.tw/2014/06/r-text-mining.html\n",
"http://bryannotes.blogspot.tw/2014/06/blog-post.html\n",
"http://bryannotes.blogspot.tw/2014/06/rrecode-data-by-percentile.html\n",
"http://bryannotes.blogspot.tw/2014/05/r_15.html\n",
"http://bryannotes.blogspot.tw/2014/05/r_8.html\n",
"http://bryannotes.blogspot.tw/2014/05/2013hot.html\n",
"http://bryannotes.blogspot.tw/2014/05/r.html\n",
"http://bryannotes.blogspot.tw/2014/04/blog-post.html\n",
"http://bryannotes.blogspot.tw/2014/04/r.html\n",
"http://bryannotes.blogspot.tw/2014/03/blog-post_19.html\n"
]
}
],
"prompt_number": 14
},
{
"cell_type": "raw",
"metadata": {},
"source": []
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"\u7167\u8457\u6293\u4e0b\u4f86\u7684\u9023\u7d50\uff0c\u5206\u5225\u6293\u6bcf\u500b\u9023\u7d50\u7684\u5167\u5bb9"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Visit every saved link and build blog = {post title: post body text}.\n",
"h = HTMLParser.HTMLParser()\n",
"blog = {}\n",
"# The with-block closes the link file even if a request or parse step raises.\n",
"with open('blog_links.txt', 'r') as bid_list:\n",
"    for line in bid_list:\n",
"        pagelink = line.strip()\n",
"        if not pagelink:\n",
"            # Blank line in the link file: there is no URL to fetch.\n",
"            continue\n",
"        request_get = requests.get(pagelink)\n",
"        soup_post = BeautifulSoup(request_get.text.encode(\"utf-8\"))\n",
"        body_tag = soup_post.find(\"div\", {'class': 'post-body entry-content'})\n",
"        title_tag = soup_post.find(\"h3\", {'class': 'post-title entry-title'})\n",
"        if body_tag is None or title_tag is None:\n",
"            # find() returns None when the element is missing; report the page\n",
"            # and move on instead of failing on None.text.\n",
"            print \"skipped (post title/body not found):\", pagelink\n",
"            continue\n",
"        # unescape() turns HTML entities in the extracted text into characters.\n",
"        body = h.unescape(body_tag.text)\n",
"        title = h.unescape(title_tag.text)\n",
"        blog[title] = body\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"\u6e2c\u8a66\u4e00\u4e0b\u6709\u6c92\u6709\u6293\u6210\u529f"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sanity check: print each stored title followed by the length of its body text.\n",
"for key, text in blog.items():\n",
"    print key, len(text)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\u5f9e\u53cd\u9ed1\u7bb1\u670d\u8cbf\u5354\u8b70\u770b\u50b3\u7d71\u5a92\u9ad4\u7684\u6c92\u843d 1036\n",
"[Python] \u57fa\u790e\u7bc7\uff1a\u6d41\u7a0b\u63a7\u5236\u3001\u7269\u4ef6\u8207\u65b9\u6cd5\u3001List & Dictionary 285\n",
"[R][\u7ffb\u8b6f] apply\u3001sapply\u3001lapply\u4e4b\u5340\u5225 2363\n",
"\u50b3\u7d71\u5e02\u5834\u5206\u6790\u4eba\u54e1\u7684\u672a\u4f86 727\n",
"[Python] python\u5165\u9580\u4f7f\u7528\u5fc3\u5f97 692\n",
"[R][\u7ffb\u8b6f]Reshape(transpose)! \u8cc7\u6599\u7684\u8b8a\u5f62\u91d1\u525b 1316\n",
"\u8cc7\u6599\u8108\u7d61\u8207\u8a6e\u91cb 936\n",
"[\u8f49\u8cbc] \u8cc7\u6599\u79d1\u5b78\u5bb6\u8207\u51e1\u4eba\u7684\u6e9d\u901a\u5229\u5668\uff1a30 \u500b\u628a\u8cc7\u6599\u8996\u89ba\u5316\u7684\u7c21\u55ae\u5de5\u5177-\u79d1\u6280\u5831\u6a58 227\n",
"\u8cc7\u6599\u7684\u5207\u8207\u4e0d\u5207\uff0cis a critical choice 508\n",
"[R]\u7528R\u8f49\u63db\u8cc7\u6599\u7d50\u69cb-\u5c07\u77e9\u9663\u578b\u8cc7\u6599\u8f49\u70ba\u4e00\u822c\u8cc7\u6599\u683c\u5f0f 593\n",
"[Python] \u73fe\u5b78\u73fe\u8ce3\u4e4b\u7db2\u8def\u722c\u87f2(Crawler)--\u4ee5\u6293\u672cBLOG\u70ba\u4f8b 465\n",
"\u5982\u4f55\u9032\u5165\u5e02\u5834\u8abf\u67e5/\u884c\u92b7\u7814\u7a76\u696d!\u6436\u4f542013\u5168\u7403\u6700HOT\u5de5\u4f5c! 1713\n",
"[R]\u7528R\u5c07\u8cc7\u6599\u4f9d\u767e\u5206\u4f4d\u6578\u5206\u7d44(Recode Data by Percentile) \u9023\u7e8c\u578b\u8cc7\u6599\u8f49\u96e2\u6563 615\n",
"[R]\u7528R\u6293\u7db2\u9801\u8cc7\u6599 460\n",
"[R]\u6700\u8fd1\u6295\u5165\u4e86R\u7684\u4e16\u754c 796\n",
"[R] TEXT MINING(\u6587\u5b57\u63a2\u52d8\u7df4\u7fd2) 692\n",
"[R]R\u7684\u5b78\u7fd2\u8cc7\u6e90\u63a8\u85a6 1214\n",
"[\u7ffb\u8b6f]\u8b93\u4f60\u7684DATA\u66f4\u4eba\u6027 (Six Ways to Make Your Data More Human) 1020\n",
"\u5c08\u6848\u7ba1\u7406\u6280\u5de7\u5206\u4eab 546\n",
"[Python] \u57fa\u672c\u8a9e\u6cd5\u4ecb\u7d39\u3001\u6559\u5b78\u8207\u7c21\u55ae\u7bc4\u4f8b 289\n"
]
}
],
"prompt_number": 8
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"\u5b58\u6210\u6a94\u6848\uff0c\u65e5\u5f8c\u5206\u6790"
]
},
{
"cell_type": "code",
"collapsed": true,
"input": [
"# Persist each post as one line of the form title,body for later analysis.\n",
"# Titles and bodies are unicode, so they are encoded to UTF-8 before writing.\n",
"# The with-block closes the file even if a write fails part-way through.\n",
"# NOTE(review): the absolute Windows path is kept unchanged; a path relative\n",
"# to the notebook would be more portable.\n",
"with open(\"C:\\\\blog_text.txt\", \"w\") as f:\n",
"    for key, text in blog.items():\n",
"        f.write(key.encode('utf-8') + \",\")\n",
"        f.write(text.encode('utf-8') + \"\\n\")\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment