Last active
May 10, 2016 22:52
-
-
Save ilimugur/37c43d517b1bf9229a2027d5627e968a to your computer and use it in GitHub Desktop.
A script to help grab text-based content from any site where content is given as a list. It can be used to grab a list of links as well as a list of text-based content. Created originally to grab the paginated and list-based text content in URL http://www.zaytung.com/sondakika.asp, which is a Turkish parody news site similar to Onion News Network.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib2 | |
from BeautifulSoup import BeautifulSoup, NavigableString | |
import re | |
def getDomainURL(url): | |
slashToLookFor = 1 | |
if url.find("http://") > -1 or url.find("https://") > -1: | |
slashToLookFor = 3 | |
endIndex = 0 | |
for i in range(0, slashToLookFor): | |
endIndex = url.find("/", endIndex) + 1 | |
return url[:endIndex] | |
def get(url, listLocation, unitLocation, isLink = False, stripTagsInUnit = False): | |
''' | |
Search url and retrieve a desired list of links or content. | |
Keyword arguments: | |
listLocation -- list of dictionaries to specifically locate container of content | |
unitLocation -- list of dictionaries to specifically locate a unit of content in container | |
isLink -- boolean value indicating what is sought: href or content (default False) | |
Return value: | |
List of contents or URLs, based on the value of isLink argument. | |
Notes: | |
- Lists listLocation and unitLocation should only contain dictionaries. | |
- Each dict must have a value assigned for exactly 1 of the keys "id", "style", "class" or "tag". | |
- While parsing HTML, those keys are checked in a specific-to-general order for each dict. | |
- "id" is checked first, followed by "style", "class" and "tag", respectively. | |
''' | |
pageHTML = urllib2.urlopen(url) | |
soup = BeautifulSoup(pageHTML) | |
for container in listLocation: | |
if container.has_key("id"): | |
soup = soup.find(id = container["id"]) | |
elif container.has_key("style"): | |
soup = soup.find(style = container["style"]) | |
elif container.has_key("tag"): | |
classDict = {} | |
if container.has_key("class"): | |
classDict["class"] = container["class"] | |
soup = soup.find(container["tag"], classDict) | |
else: | |
print "Error!" | |
result = [] | |
domainURL = getDomainURL(url) | |
soup = [soup] | |
for container in unitLocation: | |
new_soup = [] | |
if container.has_key("id"): | |
for s in soup: | |
new_soup += s.findAll(id = container["id"]) | |
elif container.has_key("style"): | |
for s in soup: | |
new_soup += s.findAll(style = container["style"]) | |
elif container.has_key("tag"): | |
classDict = {} | |
if container.has_key("class"): | |
classDict["class"] = container["class"] | |
for s in soup: | |
new_soup += s.findAll(container["tag"], classDict) | |
else: | |
print "Error!" | |
soup = new_soup | |
for listElement in soup: | |
if isLink: | |
link = listElement.get("href") | |
if link.find("http://") == -1 and link.find("https://") == -1: | |
link = domainURL + link | |
resultUnit = link | |
else: | |
resultUnit = u"" | |
for content in listElement.contents: | |
if type(content) is NavigableString: | |
resultUnit += unicode(content) | |
elif stripTagsInUnit: | |
resultUnit += re.sub("<.*?>", "", unicode(content)) | |
result.append(resultUnit) | |
return result |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment