Created
January 4, 2016 14:43
-
-
Save lantip/b2fa24b111e59d41806b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding:utf-8 -*- | |
import requests | |
import json | |
from bs4 import BeautifulSoup | |
def download_file(url):
    """Download *url* into the current directory, streaming in 1 KiB chunks.

    Returns the local filename (the last path segment of the URL).
    Raises requests.HTTPError if the server answers with an error status,
    so a 404 page is never silently saved as a "PDF".
    """
    local_filename = url.split('/')[-1]
    # stream=True keeps large PDFs out of memory; the response must then
    # be closed explicitly so the connection is released deterministically.
    r = requests.get(url, stream=True)
    try:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
    finally:
        r.close()
    return local_filename
# Scrape the Wikipedia user-library page and download every PDF linked
# from its thumbnail gallery.
main_url = 'https://en.wikipedia.org/wiki/User_talk:Crisco_1492/Library'

# Fetch and parse the page.
r = requests.get(main_url)
soup = BeautifulSoup(r.text, 'html.parser')

listfile = []
# Each <div class="thumb"> wraps a link whose <img> carries a srcset
# pointing at a thumbnail rendering of one page of the PDF.
thumbs = soup.find_all('div', attrs={'class': 'thumb'})
for t in thumbs:
    links = t.find_all('a')
    if not links:
        continue  # skip thumbs without a link instead of crashing
    imgs = links[0].find_all('img')
    if not imgs or not imgs[0].has_attr('srcset'):
        continue  # skip thumbs without a srcset-bearing image
    # srcset looks like '//upload.../thumb/x/xx/Name.pdf/page1-...':
    # dropping the '/thumb' segment and the page-image suffix yields the
    # protocol-relative URL of the original PDF.
    srcset = imgs[0]['srcset']
    pdf_url = srcset.split('.pdf/')[0].replace('thumb/', '') + '.pdf'
    listfile.append(pdf_url)

for pdf_url in listfile:
    namfile = pdf_url.split('/')[-1]
    # Parenthesized print works on both Python 2 and Python 3.
    print('downloading ' + namfile)
    download_file('http:' + pdf_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
untuk pemakaian:
simpan script ini sebagai wiki_downloader.py
pip install requests
pip install beautifulsoup4
jalankan: python wiki_downloader.py