Skip to content

Instantly share code, notes, and snippets.

@b1ca
Created November 21, 2014 11:59
Show Gist options
  • Save b1ca/66df87f8a92e5f72b2ef to your computer and use it in GitHub Desktop.
Save b1ca/66df87f8a92e5f72b2ef to your computer and use it in GitHub Desktop.
#coding=utf-8
from __future__ import unicode_literals
from openpyxl import load_workbook
from urllib import urlretrieve
import os
class Line(object):
    """One spreadsheet entry: a column label plus the URL of a PDF to fetch.

    Attributes:
        column_name: label supplied by the caller, stored unchanged.
        url: address supplied by the caller, stored unchanged.
        pdf_name: last '/'-separated segment of ``url``; used as the local
            file name of the downloaded PDF.
        txt_name: ``pdf_name`` with its final extension replaced by '.txt';
            used as the local file name of the extracted text.
    """

    def __init__(self, column_name, url):
        self.column_name = column_name
        self.url = url
        self.pdf_name = url.split('/')[-1]
        # Swap only the trailing extension. A plain replace('.pdf', '.txt')
        # would also rewrite '.pdf' in the middle of the name, and would leave
        # the name untouched for e.g. 'scan.PDF', making txt_name collide
        # with pdf_name.
        self.txt_name = os.path.splitext(self.pdf_name)[0] + '.txt'
def download_pdf(_line):
    """Fetch the document at ``_line.url`` and store it as ``_line.pdf_name``.

    The file is written relative to the current working directory, because
    ``pdf_name`` is a bare file name.
    """
    source_url = _line.url
    target_path = _line.pdf_name
    urlretrieve(source_url, target_path)
def get_txt_from_pdf(_line):
    """Convert the downloaded PDF to text with pdf2txt.py, then delete the PDF.

    Runs ``python pdf2txt.py -o <txt_name> <pdf_name>`` from the current
    working directory. The converter's exit status is not inspected. The PDF
    named by ``_line.pdf_name`` is removed whether or not the conversion
    succeeded.

    Raises:
        OSError: if the ``python`` executable cannot be started or the PDF
            cannot be removed.
    """
    # Local import keeps this block self-contained; the file's import header
    # does not bring in subprocess.
    import subprocess

    # Pass arguments as a list, with no shell involved. File names come from
    # URLs in the spreadsheet, so spaces or shell metacharacters in them must
    # not be interpreted as extra arguments or commands.
    argv = ['python', 'pdf2txt.py', '-o', _line.txt_name, _line.pdf_name]
    try:
        subprocess.call(argv)
    finally:
        os.remove(_line.pdf_name)
def remove_trash_from_txt(_line):
    """Collect the text lines that belong to this entry's column label.

    The label is ``_line.column_name`` left-padded with '0' to two characters.
    Every line of ``_line.txt_name`` is stripped of surrounding whitespace.
    Lines that start with the label and are not made up of digits only are
    kept, with the leading label removed. The text file is deleted after it
    has been read.

    Returns:
        The kept fragments joined with '; ' and terminated by ';'. When no
        line matches, the result is just ';'.
    """
    # Parentheses keep this valid as both a Python 2 print statement and a
    # Python 3 function call; the output is identical.
    print(_line.column_name)
    label = '0' * (2 - len(_line.column_name)) + _line.column_name
    prefix_len = len(label)
    fragments = []
    # Read raw bytes and decode explicitly, so undecodable bytes are dropped
    # the same way on every interpreter.
    with open(_line.txt_name, 'rb') as handle:
        for raw in handle:
            text = raw.decode('utf-8', 'ignore').strip()
            if not text.startswith(label) or text.isdigit():
                continue
            # Cut off only the leading label. Replacing every occurrence
            # would also delete the same characters further along the line
            # (e.g. the '01' inside a date).
            fragments.append(text[prefix_len:].strip())
    os.remove(_line.txt_name)
    return '; '.join(fragments) + ';'
def main(dest_filename='chinese.xlsx'):
    """Fill column C of the workbook from the PDFs referenced in column B.

    Starting at row 2, each cell in column B is read as whitespace-separated
    text whose first token is a column label and whose second token is a URL.
    The PDF at that URL is downloaded and converted to text. The lines that
    match the label are written to column C of the same row. The workbook is
    saved back to ``dest_filename``.
    """
    wb = load_workbook(dest_filename)
    # The `active` property replaces the deprecated get_active_sheet().
    ws = wb.active
    try:
        for row in range(2, ws.max_row + 1):
            raw = ws['B%d' % row].value
            # Rows with an empty cell have nothing to process; skip them
            # rather than crash on None.split().
            if raw is None:
                continue
            parts = raw.split()
            # A usable cell needs both a label and a URL.
            if len(parts) < 2:
                continue
            _col_num, _url = parts[0], parts[1]
            # Single formatted argument: prints the same text on Python 2
            # and Python 3.
            print('%s %s' % (_col_num, _url))
            line = Line(_col_num, _url)
            download_pdf(line)
            get_txt_from_pdf(line)
            ws['C%d' % row] = remove_trash_from_txt(line)
    finally:
        # Save even when a row fails, so results already gathered for
        # earlier rows are not lost.
        wb.save(dest_filename)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment