Skip to content

Instantly share code, notes, and snippets.

@sinanatra
Created May 11, 2018 16:36
Show Gist options
  • Save sinanatra/d08eaba89d00e1efc628a5bbdcd88b26 to your computer and use it in GitHub Desktop.
Save sinanatra/d08eaba89d00e1efc628a5bbdcd88b26 to your computer and use it in GitHub Desktop.
import io
import os
import time
import urllib.request

import wikipedia
from PIL import Image
# Language codes that the script tries in turn; each one is handed to
# wikipedia.set_lang() before the pages are looked up.
# NOTE: every entry needs its own trailing comma. Two adjacent string
# literals without a comma are silently concatenated by Python (this
# previously merged "bbc" and "bbc-latn" into the bogus "bbcbbc-latn").
languages = [
    "aa", "ab", "ady", "ady-cyrl", "aeb", "af", "ak", "aln", "als", "am",
    "an", "ang", "anp", "ar", "arc", "arn", "arq", "ary", "arz", "as",
    "ase", "ast", "atj", "av", "avk", "awa", "ay", "az", "azb", "ba",
    "ban", "bar", "bat-smg", "bbc", "bbc-latn", "bcc", "bcl", "be",
    "be-tarask", "bgn", "bh", "bho", "bi", "bjn", "bm", "bn", "bo", "bpy",
    "bqi", "br", "brh", "bs", "bto", "bug", "bxr", "ca", "cbk-zam", "cdo",
    "ce", "ceb", "ch", "cho", "chr", "chy", "ckb", "co", "cps", "cr",
    "crh", "crh-latn", "cs", "csb", "cu", "cv", "cy", "da", "de", "de-at",
    "de-ch", "de-formal", "din", "diq", "dsb", "dtp", "dty", "dv", "dz",
    "ee", "egl", "el", "eml", "en", "en-ca", "en-gb", "eo", "es", "et",
    "eu", "ext", "fa", "ff", "fi", "fit", "fiu-vro", "fj", "fo", "fr",
    "frc", "frp", "frr", "fur", "fy", "ga", "gag", "gan", "gan-hans",
    "gan-hant", "gcr", "gd", "gl", "glk", "gn", "gom", "gom-deva",
    "gom-latn", "gor", "got", "grc", "gsw", "gu", "gv", "ha", "hak",
    "haw", "he", "hi", "hif", "hif-latn", "hil", "ho", "hr", "hrx", "hsb",
    "ht", "hu", "hy", "hz", "ia", "id", "ie", "ig", "ii", "ik",
    "ike-cans", "ike-latn", "ilo", "inh", "io", "is", "it", "iu", "ja",
    "jam", "jbo", "jut", "jv", "ka", "kaa", "kab", "kbd", "kbd-cyrl",
    "kbp", "kg", "khw", "ki", "kiu", "kj", "kk", "kk-arab", "kk-cn",
    "kk-cyrl", "kk-kz", "kk-latn", "kk-tr", "kl", "km", "kn", "ko",
    "ko-kp", "koi", "kr", "krc", "kri", "krj", "krl", "ks", "ks-arab",
    "ks-deva", "ksh", "ku", "ku-arab", "ku-latn", "kum", "kv", "kw", "ky",
    "la", "lad", "lb", "lbe", "lez", "lfn", "lg", "li", "lij", "liv",
    "lki", "lmo", "ln", "lo", "loz", "lrc", "lt", "ltg", "lus", "luz",
    "lv", "lzh", "lzz", "mai", "map-bms", "mdf", "mg", "mh", "mhr", "mi",
    "min", "mk", "ml", "mn", "mo", "mr", "mrj", "ms", "mt", "mus", "mwl",
    "my", "myv", "mzn", "na", "nah", "nan", "nap", "nb", "nds", "nds-nl",
    "ne", "new", "ng", "niu", "nl", "nl-informal", "nn", "no", "nov",
    "nrm", "nso", "nv", "ny", "nys", "oc", "olo", "om", "or", "os", "pa",
    "pag", "pam", "pap", "pcd", "pdc", "pdt", "pfl", "pi", "pih", "pl",
    "pms", "pnb", "pnt", "prg", "ps", "pt", "pt-br", "qu", "qug", "rgn",
    "rif", "rm", "rmy", "rn", "ro", "roa-rup", "roa-tara", "ru", "rue",
    "rup", "ruq", "ruq-cyrl", "ruq-latn", "rw", "sa", "sah", "sat", "sc",
    "scn", "sco", "sd", "sdc", "sdh", "se", "sei", "ses", "sg", "sgs",
    "sh", "shi", "shi-latn", "shi-tfng", "shn", "si", "simple", "sk",
    "skr", "skr-arab", "sl", "sli", "sm", "sma", "sn", "so", "sq", "sr",
    "sr-ec", "sr-el", "srn", "ss", "st", "stq", "sty", "su", "sv", "sw",
    "szl", "ta", "tay", "tcy", "te", "tet", "tg", "tg-cyrl", "tg-latn",
    "th", "ti", "tk", "tl", "tly", "tn", "to", "tpi", "tr", "tru", "ts",
    "tt", "tt-cyrl", "tt-latn", "tum", "tw", "ty", "tyv", "tzm", "udm",
    "ug", "ug-arab", "ug-latn", "uk", "ur", "uz", "uz-cyrl", "uz-latn",
    "ve", "vec", "vep", "vi", "vls", "vmf", "vo", "vot", "vro", "wa",
    "war", "wo", "wuu", "xal", "xh", "xmf", "yi", "yo", "yue", "za",
    "zea", "zh", "zh-classical", "zh-cn", "zh-hans", "zh-hant", "zh-hk",
    "zh-min-nan", "zh-mo", "zh-my", "zh-sg", "zh-tw", "zh-yue", "zu",
]
# Running counter appended to every saved file name so images from
# different pages and language editions never overwrite each other.
num = 0

# Article titles looked up in every language edition. Other titles this
# script has been run with: 'Journalist', 'architect'.
PAGES = ['writer']

# Raster formats worth saving. SVGs and any other file type are skipped.
IMAGE_EXTENSIONS = ('jpg', 'png', 'gif')

# urlretrieve() does not create missing directories, so make sure the
# output folder exists before the first download.
os.makedirs("img", exist_ok=True)

for lan in languages:
    # Best effort per language: a missing article or an unknown language
    # code is reported and the script moves on to the next language.
    try:
        print(lan)
        wikipedia.set_lang(lan)
        for page in PAGES:
            wikipage = wikipedia.page(page, auto_suggest=False, redirect=True)
            print("Page Title: %s" % wikipage.title)
            print("Page URL: %s" % wikipage.url)
            for image_url in wikipage.images:
                # Compare the extension case-insensitively so files named
                # ".JPG" or ".PNG" are downloaded as well.
                extension = image_url.rsplit(".", 1)[-1].lower()
                if extension not in IMAGE_EXTENSIONS:
                    continue
                target = "img/" + page + str(num) + "." + extension
                # Best effort per image: one broken link must not abort
                # the remaining images of this page.
                try:
                    urllib.request.urlretrieve(image_url, target)
                except Exception as e:
                    print(e)
                num += 1
                # Pause between downloads to avoid hammering the server.
                time.sleep(.5)
    except Exception as e:
        print(e)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment