Skip to content

Instantly share code, notes, and snippets.

@suriyadeepan
Last active July 28, 2017 07:10
Show Gist options
  • Save suriyadeepan/60dcb03e437293b8d8cf6755fe10a3c3 to your computer and use it in GitHub Desktop.
Save suriyadeepan/60dcb03e437293b8d8cf6755fe10a3c3 to your computer and use it in GitHub Desktop.
Scrape from tamilvu.org
'''
Extract images from pdf.
- requires imagemagick and wand
sudo apt install imagemagick
sudo pip3 install --upgrade Wand
'''
from wand.image import Image
import sys
from utils import *
def pdf2im(filepath, resolution=300):
    """Extract each page of a PDF as a PNG image.

    Renders ``filepath`` at ``resolution`` DPI with ImageMagick (via wand)
    and writes one ``<page_index>.png`` per page into a folder named after
    the PDF, next to it (e.g. ``a/b.pdf`` -> ``a/b/0.png``, ``a/b/1.png``).

    Args:
        filepath: path to the source PDF (uses '/' separators).
        resolution: render DPI passed to ImageMagick (default 300).
    """
    # derive the output folder "<dir>/<stem>/" from the pdf path
    filename = filepath.split('/')[-1].split('.')[0]
    path = '/'.join(filepath.split('/')[:-1]) + '/' + filename + '/'
    create_folder(path)
    with Image(filename=filepath, resolution=resolution) as src:
        for i, page in enumerate(src.sequence):
            # wrap each page in its own Image so it can be saved on its
            # own, and close it promptly — the original leaked one
            # ImageMagick wand per page by never closing these objects
            with Image(page) as img:
                img.save(filename=path + str(i) + '.png')
if __name__ == '__main__':
    # demo: convert one sample PDF from the scraped collection
    sample = './tamilvu/music/cpajaneikkiirttneikalhiva.pdf'
    pdf2im(sample)
beautifulsoup4==4.6.0
bs4==0.0.1
certifi==2017.4.17
chardet==3.0.4
idna==2.5
lxml==3.8.0
requests==2.18.2
urllib3==1.22
Wand==0.4.4
from bs4 import BeautifulSoup
import requests
from utils import *
BASE = 'http://tamilvu.org/library/nationalized/scholars'

# topic index pages under BASE; each links out to the individual PDFs
_TOPICS = ('music', 'literature', 'education', 'poetry', 'law',
           'sociology', 'biography', 'drama', 'general',
           'materialscience', 'religion', 'language', 'history',
           'agriculture', 'others')
seed_urls = [BASE + '/html/' + topic + '.htm' for topic in _TOPICS]
def get_soup(url):
    """Fetch ``url`` and return its body parsed with the lxml parser."""
    page = requests.get(url)
    return BeautifulSoup(page.content, 'lxml')
def decorate_link(url):
    """Turn a page-relative href into an absolute URL under BASE.

    Drops the first two characters of ``url`` and prefixes BASE —
    assumes the scraped hrefs start with a two-character relative
    prefix (presumably ``..``); confirm against the site's markup.
    """
    return '{}{}'.format(BASE, url[2:])
def get_links(url):
    """Return absolute URLs for every PDF link found on page ``url``."""
    soup = get_soup(url)
    pdf_links = []
    for anchor in soup.find_all('a'):
        # skip anchors with no href attribute at all
        if 'href' not in anchor.attrs:
            continue
        href = anchor.get('href')
        # keep only links whose target mentions 'pdf'
        if 'pdf' in str(href):
            pdf_links.append(decorate_link(href))
    return pdf_links
def download_file(url, PATH='./'):
    """Download ``url`` and save it under ``PATH`` using the URL's basename.

    Args:
        url: direct link to the file.
        PATH: destination folder; callers pass folders with a trailing '/'.

    Raises:
        requests.HTTPError: on a 4xx/5xx response, so an HTML error page
            is never silently written to disk as if it were the PDF.
    """
    filename = url.split('/')[-1]
    # stream the body in chunks so large PDFs are not held fully in memory
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(PATH + filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=65536):
                f.write(chunk)
if __name__ == '__main__':
    # root folder for everything we download
    create_folder('./tamilvu')

    # phase 1: walk the seed pages, collect every pdf link, and remember
    # which sub-folder each link should be saved into
    links, link2folder = [], {}
    n_urls = len(seed_urls)
    print(':: Gathering links ::')
    for i, url in enumerate(seed_urls):
        print(' [{}/{}] {}'.format(i, n_urls, url))
        clinks = get_links(url)
        # one sub-folder per seed page, named after the page (sans .htm)
        subfolder = './tamilvu' + '/' + url.split('/')[-1].split('.')[0] + '/'
        create_folder(subfolder)
        for link in clinks:
            link2folder[link] = subfolder
        links.extend(clinks)

    # persist the link list so an external downloader (wget/aria2) can
    # take over if this script fails midway
    save2file(links)

    # phase 2: fetch each pdf into its assigned folder
    n_links = len(links)
    print(':: Downloading files ::')
    for i, link in enumerate(links):
        print(' [{}/{}] {}'.format(i, n_links, link))
        download_file(link, link2folder[link])
import os
def save2file(items, filename='items.list'):
    """Write each item of ``items`` (str()-converted) on its own line."""
    lines = (str(item) + '\n' for item in items)
    with open(filename, 'w') as f:
        f.writelines(lines)
def create_folder(name):
    """Create directory ``name`` (and any missing parents) if absent.

    Uses ``exist_ok=True`` instead of the original check-then-create
    sequence, which was racy: another process could create the directory
    between ``os.path.exists()`` and ``os.makedirs()`` and crash this one.
    """
    os.makedirs(name, exist_ok=True)
@demonshreder
Copy link

demonshreder commented Jul 27, 2017

Python 3 requirements (pip freeze output):

beautifulsoup4==4.6.0
bs4==0.0.1
certifi==2017.4.17
chardet==3.0.4
idna==2.5
lxml==3.8.0
requests==2.18.2
urllib3==1.22
Wand==0.4.4

@suriyadeepan
Copy link
Author

👍

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment