Skip to content

Instantly share code, notes, and snippets.

@sinanatra
Created May 31, 2018 15:59
Show Gist options
  • Save sinanatra/cd7c503c9dc8dde5f85041593e1e6eae to your computer and use it in GitHub Desktop.
Save sinanatra/cd7c503c9dc8dde5f85041593e1e6eae to your computer and use it in GitHub Desktop.
import codecs
import urllib, re
import requests
import urllib.request
import datetime
import lxml.html
import os
import time
from bs4 import BeautifulSoup as soup
# Download every raster image (jpg/png/gif) found on the pages that are
# listed as sitelinks of one Wikidata item. Files are written to the local
# "img" directory as <ID><counter><ext>, where the counter only advances
# after a successful download so names are contiguous and never overwritten.
ID = "Q36180"  # Wikidata item whose sitelinked pages are scraped
IMG_DIR = "img"
# Only these extensions are saved; everything else (notably svg) is skipped.
RASTER_EXTENSIONS = (".jpg", ".png", ".gif")

os.makedirs(IMG_DIR, exist_ok=True)

url = "https://www.wikidata.org/wiki/" + ID
print(url)
# Close the HTTP response once lxml has finished parsing it.
with urllib.request.urlopen(url) as item_page:
    html = lxml.html.parse(item_page)
links = [
    href.strip()
    for href in html.xpath(
        "//span[contains(@class, 'wikibase-sitelinkview-page')]/a/@href"
    )
]

num = 0  # running index used in the saved file names
for link in links:
    print(link)
    response = requests.get(link, timeout=30)
    # The file imports BeautifulSoup under the alias `soup`, so the parser is
    # built through that alias and the result is bound to a different name to
    # keep the alias usable on the next iteration.
    page = soup(response.text, 'html.parser')
    for img in page.find_all('img'):
        src = img.get('src')  # some <img> tags carry no src attribute
        if not src:
            continue
        # Judge the type from the URL path only, case-insensitively, so that
        # query strings and upper-case extensions do not defeat the check.
        ext = os.path.splitext(urllib.parse.urlsplit(src).path)[1].lower()
        if ext not in RASTER_EXTENSIONS:
            continue
        # Resolve protocol-relative ("//host/...") and site-relative
        # ("/path/...") sources against the page they were found on.
        img_url = urllib.parse.urljoin(link, src)
        target = os.path.join(IMG_DIR, ID + str(num) + ext)
        try:
            urllib.request.urlretrieve(img_url, target)
        except (OSError, ValueError) as e:
            # Best effort: report the failure and move on to the next image.
            print("skipped", img_url, e)
            continue
        num += 1
print("That's All Folks!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment