Skip to content

Instantly share code, notes, and snippets.

@lantip
Created January 4, 2016 14:43
Show Gist options
  • Save lantip/b2fa24b111e59d41806b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import json
from bs4 import BeautifulSoup
def download_file(url):
    """Download *url* into the current directory, streaming to disk.

    The local filename is the last path component of the URL.

    :param url: full URL of the file to fetch.
    :returns: the local filename the content was written to.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    local_filename = url.split('/')[-1]
    r = requests.get(url, stream=True)
    # Fail loudly on 4xx/5xx instead of saving an HTML error page as a PDF.
    r.raise_for_status()
    with open(local_filename, 'wb') as f:
        # Stream in 1 KiB chunks so large PDFs never sit fully in memory.
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
    return local_filename
# Scrape the Wikipedia user-library page and download every PDF linked
# from its thumbnail gallery.
main_url = 'https://en.wikipedia.org/wiki/User_talk:Crisco_1492/Library'

# Fetch and parse the page.
r = requests.get(main_url)
soup = BeautifulSoup(r.text, 'html.parser')

listfile = []
# Each <div class="thumb"> wraps one thumbnail that links to a PDF.
thumbs = soup.find_all('div', attrs={'class': 'thumb'})
for t in thumbs:
    # First anchor inside the div, then the <img> it contains.
    link = t.find_all('a')
    thu = link[0].find_all('img')
    # The thumbnail's srcset URL embeds the PDF path, e.g.
    #   //upload.../commons/thumb/a/b/Name.pdf/page1-...jpg
    # Drop everything after '.pdf/' and the 'thumb/' segment to
    # recover the direct PDF URL.
    # (renamed from `set`/`file` — those shadowed Python builtins)
    srcset = thu[0]['srcset']
    pdf_url = srcset.split('.pdf/')[0].replace('thumb/', '') + '.pdf'
    listfile.append(pdf_url)

for l in listfile:
    namfile = l.split('/')[-1]
    # Parenthesized print works on both Python 2 and Python 3.
    print('downloading ' + namfile)
    # srcset URLs are protocol-relative (start with //), so prefix a scheme.
    download_file('http:' + l)
@lantip
Copy link
Author

lantip commented Jan 4, 2016

untuk pemakaian:
simpan script ini sebagai wiki_downloader.py
install requests
install beautifulsoup4

jalankan: python wiki_downloader.py

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment