Skip to content

Instantly share code, notes, and snippets.

@jcaxmacher
Created August 27, 2018 02:53
Show Gist options
  • Save jcaxmacher/7d333ef11d99d653279b3e3a825cd6db to your computer and use it in GitHub Desktop.
Save jcaxmacher/7d333ef11d99d653279b3e3a825cd6db to your computer and use it in GitHub Desktop.
Fetch url samples
import json
import urllib.request
import urllib.parse
# Global per-domain sample counter: maps a domain string to the number of
# sample indexes already handed out for it. It is read and incremented only
# by get_domain_index(), and lives for the duration of the process.
DOMAIN_COUNTS = {}
def get_urls(filename):
    """Read URLs from a sample url file, one URL per line.

    Surrounding whitespace (including the line terminator) is stripped from
    every line. Lines that are empty after stripping are skipped, so blank
    separator lines or a trailing newline at the end of the file never turn
    into bogus empty-string URLs.

    Args:
        filename (str): full path to sample url file

    Returns:
        List[str]: the non-empty urls, in the order they appear in the file

    Raises:
        OSError: if the file cannot be opened or read
    """
    with open(filename, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        # Filter after stripping so whitespace-only lines are dropped too.
        return [url for url in stripped_lines if url]
def get_html(url):
    """Perform an HTTP GET on the provided url and return the response body.

    The request is sent with a desktop Chrome User-Agent header instead of
    urllib's default one.

    Args:
        url (str): url to request

    Returns:
        bytes: the raw, undecoded response body

    Raises:
        urllib.error.URLError: if the request fails (HTTPError for HTTP
            error statuses)
        ValueError: if the url is malformed or has no recognised scheme
    """
    request = urllib.request.Request(
        url,
        data=None,  # no request body, so urllib issues a GET
        headers={
            'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/35.0.1916.47 Safari/537.36')
        }
    )
    # Use the response as a context manager so the underlying connection is
    # closed deterministically, even if read() raises part-way through.
    with urllib.request.urlopen(request) as response:
        return response.read()
def get_domain(url):
    """Extract the host (domain) name from a url.

    Only the host is returned: any ``user:password@`` prefix and ``:port``
    suffix in the url's network location are dropped, and the host is
    lower-cased. This keeps credentials and ports out of anything derived
    from the result and makes ``Example.com`` and ``example.com`` compare
    equal.

    Args:
        url (str): url to parse

    Returns:
        str: the domain name part of the url, or an empty string when the
            url has no network location (e.g. it lacks a ``scheme://``)
    """
    parsed_uri = urllib.parse.urlparse(url)
    # .hostname is None when there is no host; normalise that to '' so the
    # function always returns a str.
    return parsed_uri.hostname or ''
def get_domain_index(domain):
    """Hand out the next sample number for a domain.

    Each call bumps the module-level DOMAIN_COUNTS entry for ``domain`` by
    one (starting from zero for a domain not seen before) and returns the
    new value.

    Args:
        domain (str): the dns domain of the article

    Returns:
        int: the 1-based index of the article within its domain
    """
    already_issued = DOMAIN_COUNTS.get(domain, 0)
    next_index = already_issued + 1
    DOMAIN_COUNTS[domain] = next_index
    return next_index
def main(filename='links.txt', fixtures_dir='tests/fixtures'):
    """Download every sample url and record the results as test fixtures.

    For each url read from ``filename`` the page is fetched and written to
    ``<fixtures_dir>/<domain>_<n>.html``, where ``n`` is the 1-based count of
    saved samples for that domain. A manifest describing every saved sample
    is then written to ``<fixtures_dir>/samples.json``.

    A url that cannot be fetched is reported on stderr and skipped, so one
    bad link does not discard the samples that were already downloaded.

    Args:
        filename (str): path to the file listing one url per line
        fixtures_dir (str): directory that receives the html files and
            ``samples.json``; created if it does not exist
    """
    import os
    import sys
    from urllib.error import URLError

    # Opening files for writing does not create missing directories.
    os.makedirs(fixtures_dir, exist_ok=True)

    samples = []
    for idx, url in enumerate(get_urls(filename)):
        print(idx, url)
        # Fetch first: the per-domain index is only consumed once there is
        # content to save, so failed fetches leave no gaps in the numbering.
        try:
            html = get_html(url)
        except (URLError, ValueError) as err:
            print('skipping {}: {}'.format(url, err), file=sys.stderr)
            continue
        domain = get_domain(url)
        index = get_domain_index(domain)
        # Built with '/' rather than os.path.join so the paths stored in
        # samples.json are identical on every platform.
        html_file = '{}/{}_{}.html'.format(fixtures_dir, domain, index)
        with open(html_file, 'wb') as f:
            f.write(html)
        samples.append({
            'url': url,
            'domain': domain,
            'html_file': html_file
        })

    with open('{}/samples.json'.format(fixtures_dir), 'w') as f:
        json.dump(samples, f, indent=4)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment