Skip to content

Instantly share code, notes, and snippets.

@zimmicz
Created August 11, 2017 05:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zimmicz/f69a5ce5d3cf3a220e171553c35e0391 to your computer and use it in GitHub Desktop.
Save zimmicz/f69a5ce5d3cf3a220e171553c35e0391 to your computer and use it in GitHub Desktop.
PostgreSQL docs scraper
# -*- coding: utf-8 -*-
#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup
# Template for the SQL-command index page; the literal "VERSION" is a
# placeholder that replace_version_in_url() swaps for a real version string.
URL = "https://www.postgresql.org/docs/VERSION/static/sql-commands.html"

# Documentation versions to scrape, newest first.
VERSIONS = [
    "10.0",
    "9.6", "9.5", "9.4", "9.3", "9.2", "9.1", "9.0",
    "8.4", "8.3", "8.2", "8.1", "8.0",
    "7.4", "7.3", "7.2", "7.1", "7.0",
]

# Scrape results: version -> {page title: synopsis character count}.
CAPS = dict()
def replace_version_in_url(new_version):
    """Return the index-page URL with the "VERSION" placeholder filled in.

    ``new_version`` is substituted for every occurrence of the literal
    text "VERSION" in the module-level ``URL`` template.
    """
    placeholder = "VERSION"
    versioned_url = URL.replace(placeholder, new_version)
    return versioned_url
def fetch_page(url, timeout=30):
    """Download ``url`` and return the response body as text.

    ``timeout`` is the number of seconds passed to ``requests.get``; without
    one, requests waits indefinitely on a server that stops responding, which
    would stall the whole scraping run on a single page.

    The HTTP status is not checked: the body of an error page is returned
    like any other page.
    """
    response = requests.get(url, timeout=timeout)
    return response.text
def make_soup(html):
    """Parse the ``html`` markup into a BeautifulSoup tree using lxml."""
    parser_name = "lxml"
    document = BeautifulSoup(html, parser_name)
    return document
def get_links_from_soup(soup):
    """Return the ``href`` of every anchor whose target starts with "sql-".

    ``soup`` is called as ``soup("a")`` to obtain the anchors. Anchors
    without an ``href`` attribute are skipped. The hrefs are returned in
    document order, duplicates included.
    """
    # .get() yields None for anchors that have no href, so no exception
    # handling is needed to skip them.
    hrefs = (anchor.get("href") for anchor in soup("a"))
    return [href for href in hrefs if href and href.startswith("sql-")]
def get_word_count_from_soup(soup):
    """Return the character count of the page's first ``<pre>`` element.

    Despite the function name, this counts characters, not words: it sums
    the lengths of the element's ``stripped_strings``, so leading and
    trailing whitespace of each text fragment is not counted.

    Returns 0 when the page contains no ``<pre>`` element, so that a single
    page without one does not abort the run with an IndexError.
    """
    pre_blocks = soup("pre")
    if not pre_blocks:
        return 0
    synopsis = pre_blocks[0]
    return sum(len(fragment) for fragment in synopsis.stripped_strings)
def get_title_from_soup(soup):
    """Return the last ": "-separated segment of the page's first <title>.

    When the title text contains no ": " separator, the whole title text is
    returned.
    """
    title_tag = soup("title")[0]
    full_title = title_tag.string
    # rpartition leaves everything after the final ": " in the last slot,
    # or the entire string there when the separator is absent.
    _, _, last_segment = full_title.rpartition(": ")
    return last_segment
def get_data_for_version(links, version):
    """Scrape every linked command page of one documentation version.

    ``links`` are page file names relative to the directory of the version's
    index page (as returned by ``get_links_from_soup``); ``version`` is the
    version string substituted into the URL template.

    Each page is fetched and parsed in turn. Returns a dict mapping the page
    title to the count produced by ``get_word_count_from_soup``. Pages that
    share a title overwrite one another, the last one winning.
    """
    # The directory part of the index URL is the same for every link, so
    # compute it once: drop the final path segment ("sql-commands.html").
    base_url = replace_version_in_url(version).rsplit("/", 1)[0]
    data = {}
    for link in links:
        page = fetch_page(base_url + "/" + link)
        soup = make_soup(page)
        title = get_title_from_soup(soup)
        data[title] = get_word_count_from_soup(soup)
    return data
# Scrape each listed documentation version and append one row per command
# page to data.csv in the form "<version>;<page title>;<character count>".
for VERSION in VERSIONS:
    # Progress output. print() with a single argument prints the same text
    # on Python 2 and Python 3, whereas the print statement is a
    # SyntaxError on Python 3.
    print(VERSION)
    url = replace_version_in_url(VERSION)
    page = fetch_page(url)
    soup = make_soup(page)
    links = get_links_from_soup(soup)
    CAPS[VERSION] = get_data_for_version(links, VERSION)
    # Append mode keeps the rows already written for earlier versions (and
    # by earlier runs of the script).
    with open("data.csv", "a") as f:
        for func, letter_count in CAPS[VERSION].items():
            # Same "title count" output the print statement produced.
            print("%s %s" % (func, letter_count))
            f.write(VERSION + ";" + func.strip() + ";" + str(letter_count) + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment