Skip to content

Instantly share code, notes, and snippets.

@aniruddha-adhikary
Last active December 19, 2015 06:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aniruddha-adhikary/5915474 to your computer and use it in GitHub Desktop.
Save aniruddha-adhikary/5915474 to your computer and use it in GitHub Desktop.
A scraper for collecting all surahs of the Quran into JSON files using BeautifulSoup.
#!/usr/bin/env python
#-*- coding: utf-8 -*-
import json
import urllib2

from BeautifulSoup import BeautifulSoup
def main():
linkslist = getAyahLinks()
sura_count = 0
for each_link in linkslist:
sura_count = sura_count + 1
writeAyah(each_link['href'], str(sura_count) + ".json", each_link.text)
print "Sura %d has been scraped (%s)." % (sura_count, each_link.text)
def getAyahLinks():
    """Return the <a> anchors of the Arabic sura menu on the index page."""
    scrapeURL = "http://www.ourholyquran.com/index.php?option=com_content&view=article&id=53&Itemid=83"
    page_html = urllib2.urlopen(scrapeURL).read()
    menu = BeautifulSoup(page_html).find("ul", {"class": "menu-arabic"})
    return menu.findAll("a")
def writeAyah(ayah_url, outfile_name, sura_name):
    """Fetch one sura page and write its ayahs to *outfile_name* as JSON.

    ayah_url     -- path component of the sura page on ourholyquran.com
    outfile_name -- destination file; overwritten on each run
    sura_name    -- human-readable sura name, stored under the "name" key

    The previous version spliced the JSON together by hand, which produced
    invalid output (trailing comma before "]", a garbled quote-escaping
    expression that was itself a syntax error) and opened the file in
    append mode, stacking documents across runs. Building a plain dict and
    letting the json module serialize it fixes escaping, validity, and
    encoding in one place.
    """
    page = urllib2.urlopen("http://www.ourholyquran.com" + ayah_url).read()
    souptext = BeautifulSoup(page)
    ayahs = souptext.findAll("td", {"class": "sura-arabic-text"})
    document = {
        "name": sura_name,
        "ayahs": [each_ayah.text for each_ayah in ayahs],
    }
    # 'w' (not 'a') so reruns replace the file; ensure_ascii=False keeps
    # the Arabic text readable, encoded explicitly as UTF-8 on write.
    with open(outfile_name, 'w') as outfile:
        outfile.write(
            json.dumps(document, ensure_ascii=False, indent=1).encode('utf8'))
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment