# b1ca/chinese_pdf.py

## chinese_pdf.py
#coding=utf-8
from __future__ import unicode_literals
from openpyxl import load_workbook
from urllib import urlretrieve
import os


class Line(object):
    """One spreadsheet entry: a column label plus the URL of its PDF.

    Attributes:
        column_name: label passed in by the caller, stored unchanged.
        url: address passed in by the caller, stored unchanged.
        pdf_name: last '/'-separated segment of ``url``.
        txt_name: ``pdf_name`` with its extension swapped for ``.txt``.
    """

    def __init__(self, column_name, url):
        self.column_name = column_name
        self.url = url
        self.pdf_name = url.split('/')[-1]
        # Swap the extension with splitext instead of replacing the text
        # '.pdf': a name without a lowercase '.pdf' (e.g. 'REPORT.PDF' or
        # 'report') would otherwise leave txt_name equal to pdf_name, so the
        # text output would share its path with the PDF.
        self.txt_name = os.path.splitext(self.pdf_name)[0] + '.txt'


def download_pdf(_line):
    """Fetch ``_line.url`` and store it locally under ``_line.pdf_name``.

    Args:
        _line: object providing ``url`` (where to download from) and
            ``pdf_name`` (local path the download is written to).
    """
    source_url = _line.url
    target_path = _line.pdf_name
    urlretrieve(source_url, filename=target_path)


def get_txt_from_pdf(_line):
    """Convert the downloaded PDF to text with pdf2txt.py, then delete the PDF.

    Runs ``python pdf2txt.py -o <txt_name> <pdf_name>``; ``pdf2txt.py`` is
    given as a relative path, so it is looked up in the current working
    directory. The converter's exit status is not checked.

    Args:
        _line: object providing ``pdf_name`` (the PDF to convert) and
            ``txt_name`` (path passed to the converter's ``-o`` option).

    Raises:
        OSError: if the ``python`` executable cannot be started or the PDF
            cannot be removed.
    """
    import subprocess

    # Pass the arguments as a list so no shell ever parses the file names:
    # they are derived from URLs and may contain spaces or metacharacters.
    subprocess.call(
        ['python', 'pdf2txt.py', '-o', _line.txt_name, _line.pdf_name])
    os.remove(_line.pdf_name)


def remove_trash_from_txt(_line):
    """Collect the text lines that start with this entry's column number.

    Reads ``_line.txt_name`` as UTF-8 (undecodable bytes are dropped), keeps
    every stripped line that starts with the zero-padded column number and is
    not made up of digits only, removes that leading number from each kept
    line, and finally deletes the text file.

    Args:
        _line: object providing ``column_name`` (string label) and
            ``txt_name`` (path of the text file to read and delete).

    Returns:
        The kept fragments joined with '; ' and terminated by ';'. When no
        line matches, the result is just ';'.
    """
    print(_line.column_name)
    # Left-pad the label with zeros to two characters ('5' -> '05').
    with_zero = '0' * (2 - len(_line.column_name)) + _line.column_name
    final_line = []
    # Binary mode yields bytes, so the explicit per-line decode below is
    # always valid.
    with open(_line.txt_name, 'rb') as f:
        for raw in f:
            m_string = raw.decode('utf-8', 'ignore').strip()
            if m_string.startswith(with_zero) and not m_string.isdigit():
                # Drop only the leading prefix; str.replace would also delete
                # later occurrences ('05 alpha 2005' -> 'alpha 20').
                final_line.append(m_string[len(with_zero):].strip())
    os.remove(_line.txt_name)
    return '; '.join(final_line) + ';'


if __name__ == '__main__':
    # For every data row of chinese.xlsx (row 1 is skipped): read
    # "<column number> <url>" from column B, download the PDF, convert it to
    # text, and write the filtered text into column C. The workbook is saved
    # once, after all rows have been processed.

    dest_filename = 'chinese.xlsx'
    wb = load_workbook(dest_filename)
    # `active` is the documented accessor; get_active_sheet() is deprecated.
    ws = wb.active
    for i in range(2, ws.max_row + 1):
        raw_value = ws['B%d' % i].value
        # max_row covers the whole sheet, so column B may be empty (None) or
        # hold a non-text value on some rows; treat those as malformed.
        cell_value = raw_value.split() if hasattr(raw_value, 'split') else []
        if len(cell_value) < 2:
            print('row %d: expected "<number> <url>" in column B, skipping' % i)
            continue
        _col_num, _url = cell_value[0], cell_value[1]
        print('%s %s' % (_col_num, _url))
        line = Line(_col_num, _url)
        download_pdf(line)
        get_txt_from_pdf(line)
        ws['C%d' % i] = remove_trash_from_txt(line)

    wb.save(dest_filename)
	#coding=utf-8
	from __future__ import unicode_literals
	from openpyxl import load_workbook
	from urllib import urlretrieve
	import os


	class Line(object):
		"""One spreadsheet entry: a column label plus the URL of its PDF.

		Attributes:
			column_name: label passed in by the caller, stored unchanged.
			url: address passed in by the caller, stored unchanged.
			pdf_name: last '/'-separated segment of ``url``.
			txt_name: ``pdf_name`` with its extension swapped for ``.txt``.
		"""

		def __init__(self, column_name, url):
			self.column_name = column_name
			self.url = url
			self.pdf_name = url.split('/')[-1]
			# Swap the extension with splitext instead of replacing the text
			# '.pdf': a name without a lowercase '.pdf' (e.g. 'REPORT.PDF' or
			# 'report') would otherwise leave txt_name equal to pdf_name, so
			# the text output would share its path with the PDF.
			self.txt_name = os.path.splitext(self.pdf_name)[0] + '.txt'


	def download_pdf(_line):
		"""Fetch ``_line.url`` and store it locally under ``_line.pdf_name``.

		Args:
			_line: object providing ``url`` (where to download from) and
				``pdf_name`` (local path the download is written to).
		"""
		urlretrieve(_line.url, _line.pdf_name)


	def get_txt_from_pdf(_line):
		"""Convert the downloaded PDF to text with pdf2txt.py, then delete the PDF.

		Runs ``python pdf2txt.py -o <txt_name> <pdf_name>``; ``pdf2txt.py`` is
		given as a relative path, so it is looked up in the current working
		directory. The converter's exit status is not checked.

		Args:
			_line: object providing ``pdf_name`` (the PDF to convert) and
				``txt_name`` (path passed to the converter's ``-o`` option).

		Raises:
			OSError: if the ``python`` executable cannot be started or the
				PDF cannot be removed.
		"""
		import subprocess

		# Pass the arguments as a list so no shell ever parses the file names:
		# they are derived from URLs and may contain spaces or metacharacters.
		subprocess.call(
			['python', 'pdf2txt.py', '-o', _line.txt_name, _line.pdf_name])
		os.remove(_line.pdf_name)


	def remove_trash_from_txt(_line):
		"""Collect the text lines that start with this entry's column number.

		Reads ``_line.txt_name`` as UTF-8 (undecodable bytes are dropped), keeps
		every stripped line that starts with the zero-padded column number and
		is not made up of digits only, removes that leading number from each
		kept line, and finally deletes the text file.

		Args:
			_line: object providing ``column_name`` (string label) and
				``txt_name`` (path of the text file to read and delete).

		Returns:
			The kept fragments joined with '; ' and terminated by ';'. When no
			line matches, the result is just ';'.
		"""
		print(_line.column_name)
		# Left-pad the label with zeros to two characters ('5' -> '05').
		with_zero = '0' * (2 - len(_line.column_name)) + _line.column_name
		final_line = []
		# Binary mode yields bytes, so the explicit per-line decode below is
		# always valid.
		with open(_line.txt_name, 'rb') as f:
			for raw in f:
				m_string = raw.decode('utf-8', 'ignore').strip()
				if m_string.startswith(with_zero) and not m_string.isdigit():
					# Drop only the leading prefix; str.replace would also
					# delete later occurrences ('05 alpha 2005' -> 'alpha 20').
					final_line.append(m_string[len(with_zero):].strip())
		os.remove(_line.txt_name)
		return '; '.join(final_line) + ';'


	if __name__ == '__main__':
		# For every data row of chinese.xlsx (row 1 is skipped): read
		# "<column number> <url>" from column B, download the PDF, convert it
		# to text, and write the filtered text into column C. The workbook is
		# saved once, after all rows have been processed.

		dest_filename = 'chinese.xlsx'
		wb = load_workbook(dest_filename)
		# `active` is the documented accessor; get_active_sheet() is deprecated.
		ws = wb.active
		for i in range(2, ws.max_row + 1):
			raw_value = ws['B%d' % i].value
			# max_row covers the whole sheet, so column B may be empty (None)
			# or hold a non-text value on some rows; treat those as malformed.
			cell_value = raw_value.split() if hasattr(raw_value, 'split') else []
			if len(cell_value) < 2:
				print('row %d: expected "<number> <url>" in column B, skipping' % i)
				continue
			_col_num, _url = cell_value[0], cell_value[1]
			print('%s %s' % (_col_num, _url))
			line = Line(_col_num, _url)
			download_pdf(line)
			get_txt_from_pdf(line)
			ws['C%d' % i] = remove_trash_from_txt(line)

		wb.save(dest_filename)