Skip to content

Instantly share code, notes, and snippets.

@ilimugur
Last active May 10, 2016 22:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ilimugur/37c43d517b1bf9229a2027d5627e968a to your computer and use it in GitHub Desktop.
Save ilimugur/37c43d517b1bf9229a2027d5627e968a to your computer and use it in GitHub Desktop.
A script to help grab text-based content from any site where the content is presented as a list. It can be used to grab a list of links as well as a list of text-based content. It was originally created to grab the paginated, list-based text content at http://www.zaytung.com/sondakika.asp, a Turkish parody news site similar to Onion News Network.
import urllib2
from BeautifulSoup import BeautifulSoup, NavigableString
import re
def getDomainURL(url):
    '''
    Return the scheme-plus-domain prefix of url, up to and including the
    slash that ends the domain part (e.g. "http://example.com/a/b" ->
    "http://example.com/").

    Keyword arguments:
    url -- the URL whose domain prefix is wanted; may or may not carry an
           "http://" / "https://" scheme prefix

    Return value:
    The prefix of url ending at the first slash after the domain. If the
    URL has no slash after the domain (e.g. "http://example.com"), the
    whole URL plus a trailing "/" is returned (the original code returned
    "" in that case, which broke relative-link joining in get()).
    '''
    # With a scheme present the domain ends at the 3rd slash
    # ("http:" + "//" accounts for the first two); otherwise the 1st.
    slashToLookFor = 1
    if url.find("http://") > -1 or url.find("https://") > -1:
        slashToLookFor = 3
    endIndex = 0
    for i in range(0, slashToLookFor):
        nextSlash = url.find("/", endIndex)
        if nextSlash == -1:
            # Domain-only URL with no trailing slash: return it whole,
            # normalized with a trailing "/" like the other return path.
            return url + "/"
        endIndex = nextSlash + 1
    return url[:endIndex]
def get(url, listLocation, unitLocation, isLink = False, stripTagsInUnit = False):
    '''
    Search url and retrieve a desired list of links or content.

    Keyword arguments:
    url -- page to fetch and parse
    listLocation -- list of dictionaries to specifically locate the container of content
    unitLocation -- list of dictionaries to specifically locate a unit of content in the container
    isLink -- boolean value indicating what is sought: href or content (default False)
    stripTagsInUnit -- when True and isLink is False, nested tags inside a unit are
                       stripped with a regex and their inner text kept (default False)

    Return value:
    List of contents (unicode strings) or URLs, based on the value of isLink.

    Notes:
    - Lists listLocation and unitLocation should only contain dictionaries.
    - Each dict must have a value assigned for at least 1 of the keys
      "id", "style" or "tag" ("class" is only honoured together with "tag").
    - Keys are checked in a specific-to-general order for each dict:
      "id" first, then "style", then "tag" (optionally narrowed by "class").
    - Written for Python 2 and BeautifulSoup 3 (urllib2, dict.has_key,
      the print statement, unicode).
    '''
    pageHTML = urllib2.urlopen(url)
    soup = BeautifulSoup(pageHTML)
    # Phase 1: drill down to the single container element. Each dict in
    # listLocation narrows the current soup by one find() step.
    for container in listLocation:
        if container.has_key("id"):
            soup = soup.find(id = container["id"])
        elif container.has_key("style"):
            soup = soup.find(style = container["style"])
        elif container.has_key("tag"):
            # BeautifulSoup 3 attribute-dict form: find(tag, {"class": ...})
            classDict = {}
            if container.has_key("class"):
                classDict["class"] = container["class"]
            soup = soup.find(container["tag"], classDict)
        else:
            # Malformed locator dict; the previous soup is kept as-is.
            print "Error!"
    result = []
    domainURL = getDomainURL(url)
    # Phase 2: fan out from the container to the individual units.
    # soup becomes a list so each step can findAll() under every match
    # collected by the previous step.
    soup = [soup]
    for container in unitLocation:
        new_soup = []
        if container.has_key("id"):
            for s in soup:
                new_soup += s.findAll(id = container["id"])
        elif container.has_key("style"):
            for s in soup:
                new_soup += s.findAll(style = container["style"])
        elif container.has_key("tag"):
            classDict = {}
            if container.has_key("class"):
                classDict["class"] = container["class"]
            for s in soup:
                new_soup += s.findAll(container["tag"], classDict)
        else:
            print "Error!"
        soup = new_soup
    # Phase 3: extract either the href or the text of each matched unit.
    for listElement in soup:
        if isLink:
            # NOTE(review): get("href") returns None when the attribute is
            # missing, which would crash the .find() below — assumes every
            # matched element carries an href; confirm against target pages.
            link = listElement.get("href")
            if link.find("http://") == -1 and link.find("https://") == -1:
                # Relative link: prefix it with the page's domain.
                link = domainURL + link
            resultUnit = link
        else:
            resultUnit = u""
            for content in listElement.contents:
                if type(content) is NavigableString:
                    resultUnit += unicode(content)
                elif stripTagsInUnit:
                    # Crude tag stripper: drops anything that looks like a
                    # tag and keeps the remaining inner text.
                    resultUnit += re.sub("<.*?>", "", unicode(content))
        result.append(resultUnit)
    return result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment