Skip to content

Instantly share code, notes, and snippets.

@ChinYikMing
Created December 16, 2022 08:52
Show Gist options
  • Save ChinYikMing/50544229ff9e2f14213ab7d00768960c to your computer and use it in GitHub Desktop.
Save ChinYikMing/50544229ff9e2f14213ab7d00768960c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
import requests
import json
import time
from time import sleep
import pandas as pd
import openpyxl
from openpyxl.styles import Font
import docx
from math import ceil
# Translation table for rm_ctrl_char: maps every code point below 0x20 to
# None (i.e. "delete"), except line feed (0x0A) and carriage return (0x0D),
# which are the only two control characters the function leaves in place.
_CTRL_CHAR_TABLE = {
    code: None for code in range(0x20) if code not in (0x0A, 0x0D)
}


def rm_ctrl_char(string):
    """Return *string* with ASCII control characters removed.

    Every character in the range 0x00-0x1F is deleted, including the
    horizontal tab (0x09), with two exceptions: line feed (0x0A) and
    carriage return (0x0D) are preserved. Characters at or above 0x20
    (including DEL, 0x7F) are left untouched.

    The removal is done in a single ``str.translate`` pass instead of one
    ``str.replace`` call per character.
    """
    return string.translate(_CTRL_CHAR_TABLE)
# Search date range. The parts are kept as zero-padded strings because they
# are spliced verbatim into the query string below.
start_year = '2019'
start_month = '01'
start_date = '02'
end_year = '2022'
end_month = '12'
end_date = '31'
# Keyword query, percent-encoded twice ('%25E6' is the encoding of '%E6').
# Fully decoded it is a boolean expression: four terms joined with OR,
# followed by three terms excluded with NOT.
querySentence = '%25E6%25A2%259D%25E7%25B4%2584%2520OR%2520%25E5%2585%25AC%25E7%25B4%2584%2520OR%2520%25E8%25AD%25B0%25E5%25AE%259A%25E6%259B%25B8%2520OR%2520%25E5%258D%2594%25E5%25AE%259A%2520NOT%2520%25E7%25B4%2584%25E5%25AE%259A%2520NOT%2520%25E5%258D%2594%25E8%25AD%25B0%2520NOT%2520%25E5%2590%2588%25E6%2584%258F'
# Number of rows requested per search page.
record_per_req = 5000

# 'YYYY%2FMM%2FDD~YYYY%2FMM%2FDD' ('%2F' is an encoded '/').
_date_range = (
    '%2F'.join((start_year, start_month, start_date))
    + '~'
    + '%2F'.join((end_year, end_month, end_date))
)
# First search page; only fetched to learn how many records match in total.
total_record_link = (
    'https://www.lawplus.com.tw/rest/search/report?querySentence=' + querySentence
    + '&date=' + _date_range
    + '&courts=TPS%2CTPA%2CTPH%2CTPB%2CTCB%2CTCH%2CKSB%2CTNH%2CKSH%2CHLH%2CKMH'
    + '&rows=' + str(record_per_req)
    + '&page=1&_=1664641123174'
)

# Without a timeout requests waits indefinitely on a stalled connection.
res = requests.get(total_record_link, timeout=30)
# Fail with a clear HTTP error instead of a JSON/KeyError on an error page.
res.raise_for_status()
data = res.json()
total_record = data['records']
# Number of pages needed to cover every record at record_per_req rows each.
total_page = ceil(total_record / record_per_req)
print(total_record, total_page)
# Identifiers of every search hit, accumulated across all result pages.
identifier = []

# 'YYYY%2FMM%2FDD~YYYY%2FMM%2FDD' ('%2F' is an encoded '/').
_date_range = (
    '%2F'.join((start_year, start_month, start_date))
    + '~'
    + '%2F'.join((end_year, end_month, end_date))
)
# Search URL template; '{}' is filled with the 1-based page number.
link = (
    'https://www.lawplus.com.tw/rest/search/report?querySentence=' + querySentence
    + '&date=' + _date_range
    + '&courts=TPS%2CTPA%2CTPH%2CTPB%2CTCB%2CTCH%2CKSB%2CTNH%2CKSH%2CHLH%2CKMH'
    + '&rows=' + str(record_per_req)
    + '&page={}&_=1664641123174'
)

for page_number in range(1, total_page + 1):
    # Without a timeout requests waits indefinitely on a stalled connection.
    res = requests.get(link.format(page_number), timeout=30)
    # Fail with a clear HTTP error instead of a JSON/KeyError on an error page.
    res.raise_for_status()
    data = res.json()
    identifier.extend(each['identifier'] for each in data['rows'])
    # NOTE(review): the original indentation was lost; the one-second pause
    # is assumed to apply once per page request - confirm.
    sleep(1)
# Create the output workbook and take its default sheet.
workbook = openpyxl.Workbook()
workbook.encoding = 'utf-8'
worksheet = workbook.active
font = Font(name='MingLiU')
# NOTE(review): openpyxl styles are normally set per cell; assigning a font
# on the worksheet object may have no effect on the saved file - confirm.
worksheet.font = font

# Header row, columns A-G in order. A 'preside_judge' column was once planned
# for column D but is disabled.
_header_names = (
    'judge_date',
    'issue',
    'type',
    'appellant',
    'defendant',
    'content',
    'related_statute',
)
for _column_letter, _header_name in zip('ABCDEFG', _header_names):
    worksheet[_column_letter + '1'] = _header_name
def _cell_text(value):
    """Return the spreadsheet value for an optional field.

    ``None`` and an empty list both become the placeholder '無'; a non-empty
    list is joined with commas; any other value is returned unchanged.
    """
    if value is None:
        return '無'
    if isinstance(value, list):
        return ','.join(value) if value else '無'
    return value


# Detail endpoint; '{}' is filled with a report identifier from the search.
note_link = 'https://www.lawplus.com.tw/rest/search/report/{}'

# For every report: download its detail record, write the full text to
# '<index>.docx' and add one summary row to the worksheet.
for counter, report_id in enumerate(identifier):
    # Without a timeout requests waits indefinitely on a stalled connection.
    res_each = requests.get(note_link.format(report_id), timeout=30)
    # Fail with a clear HTTP error instead of a JSON/KeyError on an error page.
    res_each.raise_for_status()
    temp = res_each.json()
    report_base = temp['response']['reportBase']
    report = temp['response']['report']

    # The date is taken from the fifth comma-separated field of the identifier.
    judge_date = report_base['identifier'].split(',')[4]
    # Control characters are stripped before the text is handed to
    # python-docx / openpyxl (presumably because they reject them - confirm).
    issue = rm_ctrl_char(report_base['issue'])
    content = rm_ctrl_char(report_base['content'])
    type_ = report['type']
    appellant = report['appellant']
    defendant = report['defendant']
    related_statute = report['statute']

    # Full text goes into its own Word file, named after the 0-based index.
    docx_name = str(counter) + '.docx'
    document = docx.Document()
    document.add_paragraph(content)
    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            run.font.name = 'MingLiU'
    document.save(docx_name)

    # Row 1 holds the headers, so report 0 lands on row 2.
    row = str(counter + 2)
    worksheet['A' + row] = judge_date
    worksheet['B' + row] = issue
    worksheet['C' + row] = type_
    worksheet['D' + row] = _cell_text(appellant)
    worksheet['E' + row] = _cell_text(defendant)
    # The 'content' column stores the docx file name, not the text itself.
    worksheet['F' + row] = docx_name
    worksheet['G' + row] = _cell_text(related_statute)

    print("In progress,", counter + 1, "done( total =", total_record, ")")

# NOTE(review): the original indentation was lost; the workbook is assumed to
# be saved once after all reports are processed - confirm.
workbook.save('data.xlsx')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment