hondajojo/cjwsw.py

## cjwsw.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from pyquery import PyQuery as pq
import requests


def get_full(short_url, page, timeout=30):
    """Scrape and print the judgment documents of one case category.

    Fetches the first ``page`` listing pages found under
    ``http://www.court.gov.cn/zgcpwsw/<short_url>/``, follows every listed
    document link and prints one line per document holding, in order:
    title, case code, date, document URL, PDF download URL, court and the
    text of the document body.

    Args:
        short_url: Category path segment of the listing, e.g. ``"mshz"``.
        page: Number of listing pages to fetch. A value <= 0 fetches nothing.
        timeout: Seconds to wait on each HTTP request before ``requests``
            gives up. Without a timeout a stalled server would block the
            scraper indefinitely.

    Returns:
        None. Results are written to standard output.
    """
    base = "http://www.court.gov.cn/zgcpwsw"
    link_selector = 'div[style="width:150px; z-index:-1;"] a'

    # ``range`` and the function-call form of ``print`` used below run
    # unchanged on both Python 2 and Python 3.
    for i in range(page):
        # The first listing page carries no numeric suffix.
        if i == 0:
            index_url = base + "/" + short_url + "/index.htm"
        else:
            index_url = base + "/" + short_url + "/index_%s.htm" % i
        res = requests.get(url=index_url, timeout=timeout).content

        # Parse the listing once per page rather than once per entry.
        listing = pq(res)
        # NOTE(review): the court name is read from the listing page's #nav,
        # not from each document page -- confirm that this is intended.
        court = listing('#nav > a:nth-last-child(1)').text()

        for each in listing('.bottom_right_con_five_list li').items():
            link = each.find(link_selector)
            href = link.attr.href
            if href is None:
                # Entry without a document link: there is nothing to fetch.
                continue
            title = each.find('a').attr.title
            code = link.attr.title
            date = each.find('span[style="color:#333; z-index:-1;"]').text()
            # Keep only the part of the href after its last ".."
            # (presumably the links are relative "../..." paths).
            doc_url = base + href.split('..')[-1]
            # The docId is the text between the last "_" and the next "."
            # of the document URL.
            doc_id = doc_url.split('_')[-1].split('.')[0]
            pdf_download_url = 'http://www.court.gov.cn/downloadPdf/Download?docId=' + doc_id
            res2 = requests.get(doc_url, timeout=timeout).content
            # Strip the DOCTYPE residue that ends up in the extracted text.
            content = pq(res2)('#DocArea').text().replace("//W3C//DTD HTML 4.0 Transitional//EN'> ", "")
            print("%s %s %s %s %s %s %s" % (
                title, code, date, doc_url, pdf_download_url, court, content))

# Scrape the first two listing pages of every case category, in order.
for category in (
    "mshz",    # civil cases
    "xshz",    # criminal cases
    'xzhz',    # administrative cases
    'zscqhz',  # intellectual property
    'pchz',    # compensation cases
    'zxhz',    # enforcement cases
):
    get_full(category, 2)
	#!/usr/bin/env python
	# -*- coding:utf-8 -*-
	from pyquery import PyQuery as pq
	import requests


	def get_full(short_url, page, timeout=30):
	    """Scrape and print the judgment documents of one case category.

	    Fetches the first ``page`` listing pages found under
	    ``http://www.court.gov.cn/zgcpwsw/<short_url>/``, follows every listed
	    document link and prints one line per document holding, in order:
	    title, case code, date, document URL, PDF download URL, court and the
	    text of the document body.

	    Args:
	        short_url: Category path segment of the listing, e.g. ``"mshz"``.
	        page: Number of listing pages to fetch. A value <= 0 fetches nothing.
	        timeout: Seconds to wait on each HTTP request before ``requests``
	            gives up. Without a timeout a stalled server would block the
	            scraper indefinitely.

	    Returns:
	        None. Results are written to standard output.
	    """
	    base = "http://www.court.gov.cn/zgcpwsw"
	    link_selector = 'div[style="width:150px; z-index:-1;"] a'

	    # ``range`` and the function-call form of ``print`` used below run
	    # unchanged on both Python 2 and Python 3.
	    for i in range(page):
	        # The first listing page carries no numeric suffix.
	        if i == 0:
	            index_url = base + "/" + short_url + "/index.htm"
	        else:
	            index_url = base + "/" + short_url + "/index_%s.htm" % i
	        res = requests.get(url=index_url, timeout=timeout).content

	        # Parse the listing once per page rather than once per entry.
	        listing = pq(res)
	        # NOTE(review): the court name is read from the listing page's #nav,
	        # not from each document page -- confirm that this is intended.
	        court = listing('#nav > a:nth-last-child(1)').text()

	        for each in listing('.bottom_right_con_five_list li').items():
	            link = each.find(link_selector)
	            href = link.attr.href
	            if href is None:
	                # Entry without a document link: there is nothing to fetch.
	                continue
	            title = each.find('a').attr.title
	            code = link.attr.title
	            date = each.find('span[style="color:#333; z-index:-1;"]').text()
	            # Keep only the part of the href after its last ".."
	            # (presumably the links are relative "../..." paths).
	            doc_url = base + href.split('..')[-1]
	            # The docId is the text between the last "_" and the next "."
	            # of the document URL.
	            doc_id = doc_url.split('_')[-1].split('.')[0]
	            pdf_download_url = 'http://www.court.gov.cn/downloadPdf/Download?docId=' + doc_id
	            res2 = requests.get(doc_url, timeout=timeout).content
	            # Strip the DOCTYPE residue that ends up in the extracted text.
	            content = pq(res2)('#DocArea').text().replace("//W3C//DTD HTML 4.0 Transitional//EN'> ", "")
	            print("%s %s %s %s %s %s %s" % (
	                title, code, date, doc_url, pdf_download_url, court, content))

	# Scrape the first two listing pages of every case category, in order.
	for category in (
	    "mshz",    # civil cases
	    "xshz",    # criminal cases
	    'xzhz',    # administrative cases
	    'zscqhz',  # intellectual property
	    'pchz',    # compensation cases
	    'zxhz',    # enforcement cases
	):
	    get_full(category, 2)