Skip to content

Instantly share code, notes, and snippets.

@srikanthlogic
Created June 9, 2012 11:14
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save srikanthlogic/2900594 to your computer and use it in GitHub Desktop.
Save srikanthlogic/2900594 to your computer and use it in GitHub Desktop.
Tamil Lexicon Scraper
# -*- coding: utf-8 -*-
# Scraper to get words from Tamil Lexicon
# By Srikanth Logic. (srik.lak@gmail.com)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from BeautifulSoup import BeautifulSoup
import html2text
import urllib2
import codecs
import csv
def getResults(url):
    """Fetch one Tamil Lexicon page and return a dict mapping word -> meaning.

    The page at *url* wraps each dictionary entry in a ``<div2>`` tag; the
    first ``<div2>`` is a page header and is skipped.  The headword sits in a
    nested ``<span>``; the meaning is the entry's first ``<p>``, converted to
    plain text with html2text.

    Values are UTF-8-encoded byte strings (BeautifulSoup 3's
    ``NavigableString.__str__`` encodes to UTF-8), suitable for the Python 2
    ``csv`` module.
    """
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read())
    wordlist_in_page = soup.findAll('div2')
    # Start empty: the old placeholder entry {'word': 'meaning'} leaked a
    # spurious row into the output CSV.
    word = {}
    # Skip the first <div2> (page header) by slicing instead of a loop counter.
    for oneword in wordlist_in_page[1:]:
        headword = str(oneword.find('span').find('span').contents[0])
        word[headword] = html2text.html2text(str(oneword.find('p')))
    return word
def main():
#sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
fl = codecs.open('words.csv','wb','utf-8')
url = 'http://dsal.uchicago.edu/cgi-bin/philologic/getobject.pl?' + 'p.0:4.tamillex'
words_in_page = getResults(url)
writer = csv.writer(fl,delimiter='\t',dialect='excel',quoting=csv.QUOTE_ALL)
for word in words_in_page.keys():
writer.writerow((word,words_in_page[word]))
print 'Completed Page' + str(url)
fl.close()
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
@tshrinivasan
Copy link

python dl.py
Traceback (most recent call last):
File "dl.py", line 50, in
main()
File "dl.py", line 42, in main
words_in_page = getResults(url)
File "dl.py", line 25, in getResults
soup = BeautifulSoup(page.read())
File "/usr/local/lib/python2.7/dist-packages/BeautifulSoup-3.2.1-py2.7.egg/BeautifulSoup.py", line 1522, in init
BeautifulStoneSoup.init(self, _args, *_kwargs)
File "/usr/local/lib/python2.7/dist-packages/BeautifulSoup-3.2.1-py2.7.egg/BeautifulSoup.py", line 1147, in init
self._feed(isHTML=isHTML)
File "/usr/local/lib/python2.7/dist-packages/BeautifulSoup-3.2.1-py2.7.egg/BeautifulSoup.py", line 1189, in feed
SGMLParser.feed(self, markup)
File "/usr/lib/python2.7/sgmllib.py", line 104, in feed
self.goahead(0)
File "/usr/lib/python2.7/sgmllib.py", line 143, in goahead
k = self.parse_endtag(i)
File "/usr/lib/python2.7/sgmllib.py", line 320, in parse_endtag
self.finish_endtag(tag)
File "/usr/lib/python2.7/sgmllib.py", line 358, in finish_endtag
method = getattr(self, 'end
' + tag)
UnicodeEncodeError: 'ascii' codec can't encode characters in position 10-25: ordinal not in range(128)

getting the above error on executing this code.

@abuvanth
Copy link

Traceback (most recent call last):
File "tla.py", line 50, in
main()
File "tla.py", line 42, in main
words_in_page = getResults(url)
File "tla.py", line 24, in getResults
page = urllib2.urlopen(url)
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 448, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 500: Internal Server Error
desk-05@desk-05:~/abu$

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment