Skip to content

Instantly share code, notes, and snippets.

@srikanthlogic
Created June 9, 2012 11:14
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save srikanthlogic/2900594 to your computer and use it in GitHub Desktop.
Save srikanthlogic/2900594 to your computer and use it in GitHub Desktop.
Tamil Lexicon Scraper
# -*- coding: utf-8 -*-
# Scraper to get words from the Tamil Lexicon
# By Srikanth Logic. (srik.lak@gmail.com)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from BeautifulSoup import BeautifulSoup
import html2text
import urllib2
import codecs
import csv
def getResults(url):
    """Fetch one lexicon page and return its entries as a dict.

    The page at ``url`` is downloaded and parsed with BeautifulSoup. Every
    ``<div2>`` element except the first is treated as one entry: the key is
    the string form of the first content node of the ``<span>`` nested inside
    the entry's first ``<span>``, and the value is the html2text rendering of
    the entry's first ``<p>``.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict mapping each headword to its meaning text. The dict always
        contains the seed pair ``'word': 'meaning'``.

    Raises:
        urllib2.URLError: If the page cannot be fetched (``HTTPError`` is a
            subclass, e.g. on an HTTP 500 response).
        AttributeError: If an entry lacks the expected ``<span>`` nesting.
    """
    page = urllib2.urlopen(url)
    try:
        soup = BeautifulSoup(page.read())
    finally:
        # Release the connection even when reading or parsing fails.
        page.close()

    # Seed pair kept from the original; presumably meant as a header row.
    # NOTE(review): dict ordering is arbitrary here, so callers that iterate
    # the dict cannot rely on this pair coming out first.
    word = {'word': 'meaning'}

    # The first <div2> is skipped, as the original did with its counter.
    # NOTE(review): presumably it is a page-level wrapper, not an entry.
    for oneword in soup.findAll('div2')[1:]:
        headword = oneword.find('span').find('span').contents[0].__str__()
        word[headword] = html2text.html2text(oneword.find('p').__str__())
    return word
def main():
    """Scrape one lexicon page and write its entries to ``words.csv``.

    The entries returned by ``getResults`` are written as tab-delimited rows
    of ``(word, meaning)`` with every field quoted, through a UTF-8 codecs
    writer. A completion message is printed once all rows are written.

    The page is fetched before the output file is opened, so a failed
    download (for example an HTTP 500 from the server) neither truncates an
    existing ``words.csv`` nor leaves an open file handle behind.
    """
    url = 'http://dsal.uchicago.edu/cgi-bin/philologic/getobject.pl?' + 'p.0:4.tamillex'
    words_in_page = getResults(url)

    # The with-block guarantees the file is closed even if a row fails.
    # NOTE(review): the Python 2 csv module does not support Unicode input;
    # confirm that the strings getResults returns pass through this codecs
    # writer cleanly for non-ASCII text.
    with codecs.open('words.csv', 'wb', 'utf-8') as fl:
        writer = csv.writer(fl, delimiter='\t', dialect='excel', quoting=csv.QUOTE_ALL)
        for word, meaning in words_in_page.items():
            writer.writerow((word, meaning))

    # Single-argument parenthesised form prints the same text on Python 2.
    print('Completed Page' + str(url))
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
@abuvanth
Copy link

Traceback (most recent call last):
File "tla.py", line 50, in <module>
main()
File "tla.py", line 42, in main
words_in_page = getResults(url)
File "tla.py", line 24, in getResults
page = urllib2.urlopen(url)
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 448, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 500: Internal Server Error
desk-05@desk-05:~/abu$

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment