# Imports required by the methods below
import copy
from urllib.parse import urlparse

from bs4 import BeautifulSoup


def parse_category(self, url, depth):
    """
    Collects the links from a category page and downloads/parses each of them.
    :param url: URL of the category page
    :param depth: current crawl depth
    :return: list of parsed pages
    """
    page_content = self.download_page(url)
    if page_content is None:
        return []
    base_url = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    soup = BeautifulSoup(page_content, 'lxml')
    pages = []
    # keep only anchors that actually carry an href
    links = list(filter(lambda x: x.get('href') is not None, soup.find_all('a')))
    for link in links:
        link_url = link.get('href')
        pages.extend(self.crawl(base_url + link_url, depth + 1))
    return pages

def parse_page(self, url, depth=0):
    """
    Downloads and parses a Wikipedia page and stores it if required.
    :param url: URL of the article page
    :param depth: current crawl depth
    :return: list of parsed pages (this page plus pages reached from it)
    """
    print("Parsing page: ", url)
    page_content = self.download_page(url)
    if page_content is None:
        return []
    soup = BeautifulSoup(page_content, 'lxml')
    pages = []
    page = WikipediaPage(url)

    # extract wikipedia links and crawl them recursively
    base_url = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    for link in soup.find_all('a'):
        link_url = link.get('href')
        if link_url is not None and self.wiki_page_link_pattern.match(link_url):
            page.links.append(base_url + link_url)
            pages.extend(self.crawl(base_url + link_url, depth + 1))

    # extract paragraphs: collect <p> text under the current <h2> heading
    text_container = soup.find('div', {'class': 'mw-parser-output'})
    zero_paragraph = {"title": "", "text": ""}
    current_paragraph = copy.deepcopy(zero_paragraph)
    for child in text_container.children:
        if child.name == "p":
            current_paragraph["text"] += child.text + "\n"
        elif child.name == "h2":
            page.paragraphs.append(current_paragraph)
            current_paragraph = copy.deepcopy(zero_paragraph)
            current_paragraph["title"] = next(child.children).text
    page.paragraphs.append(current_paragraph)  # keep the section after the last heading
    page.paragraphs = list(filter(lambda x: x["text"] != "", page.paragraphs))

    # extract graphics: thumbnail link plus its caption
    image_container = soup.find_all('div', {'class': 'thumbinner'})
    zero_graphic = {"url": "", "caption": ""}
    for image in image_container:
        current_graphic = copy.deepcopy(zero_graphic)
        for child in image.children:
            if child.name == "a":
                current_graphic["url"] = child.get('href')
            elif child.name == "div":
                current_graphic["caption"] = child.text
        page.graphics.append(current_graphic)

    # extract table of contents and title
    toc_element = soup.find(id="toc")
    if toc_element is not None:
        page.table_of_contents = list(filter(lambda x: x != "", toc_element.text.split("\n")[1:]))
    page.title = soup.find(id="firstHeading").text
    page.html = str(soup)

    if self.store_after_parsing:
        page.store(self.directory)
    pages.append(page)
    return pages

def crawl(self, initial_link, depth=0):
    """
    Entry point: dispatches a link to the category or page parser,
    as long as the maximum depth and the allowed origins permit it.
    """
    if depth <= self.max_depth:
        base_url = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(initial_link))
        if base_url in self.valid_origins:
            # match the patterns against the path part only (scheme and host removed)
            path = initial_link[len(base_url):]
            if self.category_link_pattern.match(path):
                return self.parse_category(initial_link, depth)
            elif self.wiki_page_link_pattern.match(path):
                return self.parse_page(initial_link, depth)
    return []
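
The gist shows only the three methods above; the enclosing crawler class, its link patterns, the download helper, and the fields of the WikipediaPage container are not part of it. The sketch below is a minimal, assumption-laden reconstruction of those missing pieces, inferred from how the methods use them, just to make the snippet runnable end to end. The class name WikipediaCrawler, the regular expressions, the requests-based download_page, and the JSON store method are guesses, not the author's actual implementation; WikipediaPage's fields are inferred from parse_page.

import json
import os
import re
from urllib.parse import urlparse

import requests


class WikipediaPage:
    """Hypothetical container matching the attributes used in parse_page."""

    def __init__(self, url):
        self.url = url
        self.title = ""
        self.html = ""
        self.links = []
        self.paragraphs = []
        self.graphics = []
        self.table_of_contents = []

    def store(self, directory):
        # Persist the page as JSON; the original storage format is unknown.
        file_name = urlparse(self.url).path.strip("/").replace("/", "_") + ".json"
        with open(os.path.join(directory, file_name), "w", encoding="utf-8") as f:
            json.dump(self.__dict__, f, ensure_ascii=False)


class WikipediaCrawler:
    """Hypothetical enclosing class for the three methods in the gist."""

    def __init__(self, directory, max_depth=1, store_after_parsing=False):
        self.directory = directory
        self.max_depth = max_depth
        self.store_after_parsing = store_after_parsing
        # Only links on these hosts are followed.
        self.valid_origins = ["https://en.wikipedia.org"]
        # Plausible patterns for article and category paths (assumptions).
        self.wiki_page_link_pattern = re.compile(r"^/wiki/(?!Category:)[^:#?]+$")
        self.category_link_pattern = re.compile(r"^/wiki/Category:.+$")

    def download_page(self, url):
        # Simple fetch helper; the original implementation is not shown.
        response = requests.get(url, timeout=10)
        return response.text if response.ok else None

    # parse_category, parse_page and crawl from the gist go here unchanged.


# Example invocation (hypothetical):
# crawler = WikipediaCrawler(directory="./pages", max_depth=1)
# pages = crawler.crawl("https://en.wikipedia.org/wiki/Category:Information_retrieval")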