Skip to content

Instantly share code, notes, and snippets.

@hondajojo
Created June 27, 2015 12:26
Show Gist options
  • Save hondajojo/aced324d9ed5e47611f8 to your computer and use it in GitHub Desktop.
Save hondajojo/aced324d9ed5e47611f8 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from pyquery import PyQuery as pq
import requests
def get_full(short_url, page, timeout=30):
    """Crawl listing pages of one category and print every linked document.

    For each of the first ``page`` listing pages under
    ``http://www.court.gov.cn/zgcpwsw/<short_url>/`` this fetches the page,
    walks its ``.bottom_right_con_five_list li`` entries, downloads the
    document each entry links to, and prints one space-separated line per
    document: title, code, date, document URL, PDF download URL, court and
    the text of the document's ``#DocArea`` element.

    Args:
        short_url: Category path segment inserted into the listing URL
            (for example ``"mshz"``).
        page: Number of listing pages to crawl. The first page is
            ``index.htm``; later ones are ``index_<n>.htm`` with n >= 1.
        timeout: Seconds passed to every ``requests.get`` call. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        None. All results are written to standard output.

    Raises:
        requests.exceptions.RequestException: On a network failure or when
            a request exceeds ``timeout``.
    """
    base_url = "http://www.court.gov.cn/zgcpwsw"
    pdf_url_prefix = "http://www.court.gov.cn/downloadPdf/Download?docId="
    link_selector = 'div[style="width:150px; z-index:-1;"] a'
    date_selector = 'span[style="color:#333; z-index:-1;"]'

    # range() keeps the function runnable on both Python 2 and Python 3.
    for page_index in range(page):
        if page_index == 0:
            listing_url = "%s/%s/index.htm" % (base_url, short_url)
        else:
            listing_url = "%s/%s/index_%s.htm" % (base_url, short_url, page_index)

        # Parse the listing once per page; every entry below reuses it.
        listing = pq(requests.get(url=listing_url, timeout=timeout).content)

        # NOTE(review): "court" is read from the listing page's #nav trail,
        # not from the individual document page -- confirm this is intended.
        court = listing('#nav > a:nth-last-child(1)').text()

        for entry in listing('.bottom_right_con_five_list li').items():
            link = entry.find(link_selector)
            href = link.attr.href
            if not href:
                # Entry without the expected document link: nothing to fetch.
                continue

            title = entry.find('a').attr.title
            code = link.attr.title
            date = entry.find(date_selector).text()

            # hrefs are relative ("../..."); keep only the part after the
            # last ".." and append it to the site base.
            doc_url = base_url + href.split('..')[-1]
            # The document id is the text between the last "_" of the URL
            # and the first "." that follows it.
            doc_id = doc_url.split('_')[-1].split('.')[0]
            pdf_download_url = pdf_url_prefix + doc_id

            document = pq(requests.get(doc_url, timeout=timeout).content)
            # Strip the DOCTYPE remnant that ends up inside the extracted text.
            content = document('#DocArea').text().replace(
                "//W3C//DTD HTML 4.0 Transitional//EN'> ", "")

            # A single pre-formatted argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print("%s %s %s %s %s %s %s" % (
                title, code, date, doc_url, pdf_download_url, court, content))
# Crawl the first two listing pages of each case category, in this order.
for category_slug in (
    "mshz",    # civil cases
    "xshz",    # criminal cases
    "xzhz",    # administrative cases
    "zscqhz",  # intellectual property
    "pchz",    # compensation cases
    "zxhz",    # enforcement cases
):
    get_full(category_slug, 2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment