Skip to content

Instantly share code, notes, and snippets.

@suriyadeepan
Last active July 28, 2017 07:10
Show Gist options
  • Save suriyadeepan/60dcb03e437293b8d8cf6755fe10a3c3 to your computer and use it in GitHub Desktop.
Save suriyadeepan/60dcb03e437293b8d8cf6755fe10a3c3 to your computer and use it in GitHub Desktop.
Scrape from tamilvu.org
'''
Extract images from pdf.
- requires imagemagick and wand
sudo apt install imagemagick
sudo pip3 install --upgrade Wand
'''
from wand.image import Image
import sys
from utils import *
def pdf2im(filepath, resolution=300):
    """Extract each page of a PDF as a PNG image.

    Renders ``filepath`` at ``resolution`` DPI with ImageMagick (via wand)
    and writes one ``<page_index>.png`` per page into a folder named after
    the PDF, next to it (e.g. ``a/b.pdf`` -> ``a/b/0.png``, ``a/b/1.png``).

    Args:
        filepath: path to the source PDF (uses '/' separators).
        resolution: render DPI passed to ImageMagick (default 300).
    """
    # derive the output folder "<dir>/<stem>/" from the pdf path
    filename = filepath.split('/')[-1].split('.')[0]
    path = '/'.join(filepath.split('/')[:-1]) + '/' + filename + '/'
    create_folder(path)
    with Image(filename=filepath, resolution=resolution) as src:
        for i, page in enumerate(src.sequence):
            # wrap each page in its own Image so it can be saved on its
            # own, and close it promptly — the original leaked one
            # ImageMagick wand per page by never closing these objects
            with Image(page) as img:
                img.save(filename=path + str(i) + '.png')
if __name__ == '__main__':
    # demo: convert one sample PDF from the scraped collection
    sample = './tamilvu/music/cpajaneikkiirttneikalhiva.pdf'
    pdf2im(sample)
beautifulsoup4==4.6.0
bs4==0.0.1
certifi==2017.4.17
chardet==3.0.4
idna==2.5
lxml==3.8.0
requests==2.18.2
urllib3==1.22
Wand==0.4.4
from bs4 import BeautifulSoup
import requests
from utils import *
BASE = 'http://tamilvu.org/library/nationalized/scholars'

# topic index pages under BASE; each links out to the individual PDFs
_TOPICS = ('music', 'literature', 'education', 'poetry', 'law',
           'sociology', 'biography', 'drama', 'general',
           'materialscience', 'religion', 'language', 'history',
           'agriculture', 'others')
seed_urls = [BASE + '/html/' + topic + '.htm' for topic in _TOPICS]
def get_soup(url):
    """Fetch ``url`` and return its body parsed with the lxml parser."""
    page = requests.get(url)
    return BeautifulSoup(page.content, 'lxml')
def decorate_link(url):
    """Turn a page-relative href into an absolute URL under BASE.

    Drops the first two characters of ``url`` and prefixes BASE —
    assumes the scraped hrefs start with a two-character relative
    prefix (presumably ``..``); confirm against the site's markup.
    """
    return '{}{}'.format(BASE, url[2:])
def get_links(url):
    """Return absolute URLs for every PDF link found on page ``url``."""
    soup = get_soup(url)
    pdf_links = []
    for anchor in soup.find_all('a'):
        # skip anchors with no href attribute at all
        if 'href' not in anchor.attrs:
            continue
        href = anchor.get('href')
        # keep only links whose target mentions 'pdf'
        if 'pdf' in str(href):
            pdf_links.append(decorate_link(href))
    return pdf_links
def download_file(url, PATH='./'):
    """Download ``url`` and save it under ``PATH`` using the URL's basename.

    Args:
        url: direct link to the file.
        PATH: destination folder; callers pass folders with a trailing '/'.

    Raises:
        requests.HTTPError: on a 4xx/5xx response, so an HTML error page
            is never silently written to disk as if it were the PDF.
    """
    filename = url.split('/')[-1]
    # stream the body in chunks so large PDFs are not held fully in memory
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(PATH + filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=65536):
                f.write(chunk)
if __name__ == '__main__':
    # root folder for everything we download
    create_folder('./tamilvu')

    # phase 1: walk the seed pages, collect every pdf link, and remember
    # which sub-folder each link should be saved into
    links, link2folder = [], {}
    n_urls = len(seed_urls)
    print(':: Gathering links ::')
    for i, url in enumerate(seed_urls):
        print(' [{}/{}] {}'.format(i, n_urls, url))
        clinks = get_links(url)
        # one sub-folder per seed page, named after the page (sans .htm)
        subfolder = './tamilvu' + '/' + url.split('/')[-1].split('.')[0] + '/'
        create_folder(subfolder)
        for link in clinks:
            link2folder[link] = subfolder
        links.extend(clinks)

    # persist the link list so an external downloader (wget/aria2) can
    # take over if this script fails midway
    save2file(links)

    # phase 2: fetch each pdf into its assigned folder
    n_links = len(links)
    print(':: Downloading files ::')
    for i, link in enumerate(links):
        print(' [{}/{}] {}'.format(i, n_links, link))
        download_file(link, link2folder[link])
import os
def save2file(items, filename='items.list'):
    """Write each item of ``items`` (str()-converted) on its own line."""
    lines = (str(item) + '\n' for item in items)
    with open(filename, 'w') as f:
        f.writelines(lines)
def create_folder(name):
    """Create directory ``name`` (and any missing parents) if absent.

    Uses ``exist_ok=True`` instead of the original check-then-create
    sequence, which was racy: another process could create the directory
    between ``os.path.exists()`` and ``os.makedirs()`` and crash this one.
    """
    os.makedirs(name, exist_ok=True)
@demonshreder
Copy link

demonshreder commented Jul 27, 2017

Python 3 requirements (pip freeze output):

beautifulsoup4==4.6.0
bs4==0.0.1
certifi==2017.4.17
chardet==3.0.4
idna==2.5
lxml==3.8.0
requests==2.18.2
urllib3==1.22
Wand==0.4.4

@suriyadeepan
Copy link
Author

👍

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment