Skip to content

Instantly share code, notes, and snippets.

@j2labs
Created October 25, 2009 17:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save j2labs/218144 to your computer and use it in GitHub Desktop.
Save j2labs/218144 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from BeautifulSoup import BeautifulSoup
import urllib
class MozillaOpener(urllib.FancyURLopener):
    """URL opener that presents a Firefox 3.5.3 (Mac OS X) User-Agent."""

    # urllib's opener classes send this class attribute as the User-Agent
    # header; overriding it replaces the default "Python-urllib/x.y" value.
    version = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; sv-SE; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3'
# Install the custom opener as urllib's module-wide default so that the
# urllib.urlopen() call below goes through it (and its User-Agent).
urllib._urlopener = MozillaOpener()

# Download the Urdu Wikipedia article on the Urdu language.
response = urllib.urlopen('http://ur.wikipedia.org/wiki/%D8%A7%D8%B1%D8%AF%D9%88')
try:
    page = response.read()
finally:
    # Release the connection even if the read fails.
    response.close()

soup = BeautifulSoup(page)

# Reduce every <p> element to the concatenation of its text nodes,
# discarding the markup between them.
p_texts = [''.join(p_markup.findAll(text=True))
           for p_markup in soup.findAll('p')]

# Separate paragraphs with a blank line.
urdu_text = '\n\n'.join(p_texts)

# Encode explicitly to UTF-8 before writing; the context manager closes the
# file even if encoding or writing raises.
with open('urdu.txt', 'w') as f:
    f.write(urdu_text.encode('utf8'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment