Skip to content

Instantly share code, notes, and snippets.

@mayhewsw
Last active February 18, 2016 23:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mayhewsw/1600aeade3693db38195 to your computer and use it in GitHub Desktop.
Save mayhewsw/1600aeade3693db38195 to your computer and use it in GitHub Desktop.
Scrape script information from scriptsource
#!/usr/bin/python
from bs4 import BeautifulSoup
# Input: iso15924list.html, presumably a saved copy of
# http://unicode.org/iso15924/iso15924-codes.html
def first_cells(html_doc):
    """Return the UTF-8 encoded text of the first <td> of every table row.

    Rows that contain no <td> at all (for example header rows made only of
    <th> cells) are skipped; previously they raised AttributeError because
    ``tr.td`` is None for such rows.
    """
    soup = BeautifulSoup(html_doc, 'html.parser')
    cells = (tr.td for tr in soup.find_all("tr"))
    return [cell.text.encode("utf8") for cell in cells if cell is not None]


if __name__ == "__main__":
    with open("iso15924list.html") as f:
        html_doc = f.read()
    # One cell per line on stdout.
    for cell_text in first_cells(html_doc):
        print(cell_text)
#!/usr/bin/python
import re
import urllib2
import os.path
# Detail page on scriptsource.org for a single script code ({0} is the code).
url = "http://scriptsource.org/cms/scripts/page.php?item_id=script_detail&key={0}"

# Heading followed by a count, e.g. "Writing systems that use this script (12)".
pat = re.compile(r"Writing systems that use this script \((\d+)\)")
# The same heading with no count after it.
otherpat = re.compile("Writing systems that use this script")


def read_codes(path="iso15924codes.txt"):
    """Return the script codes listed one per line in *path*.

    Surrounding whitespace is stripped and blank lines are dropped, so a
    trailing empty line no longer causes a request for an empty code.
    """
    with open(path) as f:
        stripped = [line.strip() for line in f]
    return [code for code in stripped if code]


def fetch_page(code, cache_dir="scriptpages"):
    """Return the detail page for *code*, downloading it at most once.

    A page already saved as ``<cache_dir>/<code>.html`` is read from disk;
    otherwise it is fetched from ``url`` and saved there for later runs.
    """
    fname = os.path.join(cache_dir, code + ".html")
    if os.path.isfile(fname):
        with open(fname) as f:
            return f.read()

    response = urllib2.urlopen(url.format(code))
    try:
        html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    # The cache directory may not exist on a first run.
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    with open(fname, "w") as out:
        out.write(html)
    return html


def classify_page(html):
    """Return ``(label, count)`` for one detail page.

    *label* is the text printed next to the code: the matched number, or
    "prob 0" when the heading is present without a count, or "weird" when
    the heading is missing altogether. *count* is 0 in the last two cases.
    """
    counted = pat.search(html)
    if counted is not None:
        return counted.group(1), int(counted.group(1))
    if otherpat.search(html) is not None:
        return "prob 0", 0
    return "weird", 0


def rank_scripts(langs, limit=50, override=("Latn", 513)):
    """Return the *limit* leading ``(code, count)`` pairs for plotting.

    Pairs are sorted by count, largest first, and the hard-coded *override*
    pair is placed at the front. Any scraped pair with the same code is
    dropped first so that the code is not listed and summed twice.
    """
    forced_code = override[0]
    ranked = sorted(
        (pair for pair in langs if pair[0] != forced_code),
        key=lambda pair: pair[1],
        reverse=True,
    )
    ranked.insert(0, override)
    return ranked[:limit]


def plot_counts(counts):
    """Show a line plot of *counts* against their position in the list."""
    # Imported here so the scraping helpers do not require matplotlib.
    import matplotlib.pyplot as plt

    plt.plot(range(len(counts)), counts)
    plt.xlabel('Individual Script')
    plt.ylabel('Num langs per script')
    plt.title('Number of languages by script')
    plt.show()


def main():
    """Scrape every code, print "<code> <label>" lines, plot, print the sum."""
    langs = []
    for code in read_codes():
        label, count = classify_page(fetch_page(code))
        # Single-argument print keeps the "<code> <label>" line format and
        # parses under both the print statement and the print function.
        print("%s %s" % (code, label))
        langs.append((code, count))

    top = rank_scripts(langs)
    y = [count for _, count in top]
    plot_counts(y)
    print(sum(y))


if __name__ == "__main__":
    main()
Adlm 0
Afak 1
Aghb 0
Ahom 1
Arab 171
Aran 69
Armi 0
Armn 1
Avst 1
Bali 3
Bamu 1
Bass 1
Batk 6
Beng 39
Bhks 0
Blis 1
Bopo 6
Brah 0
Brai 97
Bugi 3
Buhd 1
Cakm 2
Cans 26
Cari 0
Cham 2
Cher 1
Cirt 0
Copt 4
Cprt 1
Cyrl 116
Cyrs 2
Deva 182
Dsrt 1
Dupl 0
Egyd 0
Egyh 0
Egyp 0
Elba 1
Ethi 43
Geok 1
Geor 6
Glag 1
Goth 1
Gran 0
Grek 12
Gujr 12
Guru 3
Hanb 0
Hang 1
Hani 24
Hano 1
Hans 14
Hant 13
Hatr 0
Hebr 24
Hira 0
Hluw 0
Hmng 4
Hrkt 0
Hung 0
Inds 0
Ital 3
Jamo 0
Java 5
Jpan 1
Jurc 0
Kali 2
Kana 3
Khar 0
Khmr 7
Khoj 4
Kitl 0
Kits 0
Knda 11
Kore 1
Kpel 1
Kthi 3
Lana 5
Laoo 19
Latf 2
Latg 1
Latn 513
Leke 0
Lepc 1
Limb 1
Lina 1
Linb 1
Lisu 3
Loma 1
Lyci 0
Lydi 0
Mahj 0
Mand 2
Mani 0
Marc 1
Maya 0
Mend 1
Merc 0
Mero 0
Mlym 10
Modi 0
Mong 11
Moon 0
Mroo 1
Mtei 1
Mult 0
Mymr 27
Narb 0
Nbat 0
Newa 0
Nkgb 1
Nkoo 4
Nshu 2
Ogam 3
Olck 1
Orkh 0
Orya 22
Osge 1
Osma 1
Palm 0
Pauc 0
Perm 2
Phag 5
Phli 0
Phlp 0
Phlv 0
Phnx 1
Piqd 0
Plrd 8
Prti 0
Qaaa 0
Qabx 0
Rjng 4
Roro 0
Runr 1
Samr 2
Sara 0
Sarb 0
Saur 1
Sgnw 12
Shaw 1
Shrd 2
Sidd 0
Sind 0
Sinh 3
Sora 1
Sund 1
Sylo 1
Syrc 9
Syre 0
Syrj 0
Syrn 0
Tagb 1
Takr 9
Tale 3
Talu 1
Taml 10
Tang 1
Tavt 5
Telu 19
Teng 0
Tfng 12
Tglg 3
Thaa 1
Thai 35
Tibt 33
Tirh 1
Ugar 1
Vaii 1
Visp 0
Wara 1
Wole 1
Xpeo 1
Xsux 1
Yiii 10
Zinh 0
Zmth 0
Zsye 0
Zsym 0
Zxxx 0
Zyyy 0
Zzzz 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment