# jcaxmacher/fetch_samples.py

## fetch_samples.py
import json
import urllib.request
import urllib.parse


# Global counter for samples from the same domain.
# Maps a domain string to the number of samples indexed for it so far;
# get_domain_index() creates and increments the entries.
DOMAIN_COUNTS = {}


def get_urls(filename):
    """Read URLs from the sample url file, one per line.

    Surrounding whitespace is stripped from every line, and lines that are
    empty after stripping are skipped so that blank or trailing lines in the
    file do not turn into empty urls that fail when they are requested.

    Args:
        filename (str): full path to sample url file

    Returns:
        List[str]: the non-empty urls, in file order
    """
    with open(filename, 'r') as f:
        stripped = (line.strip() for line in f)
        # Consume the generator while the file is still open.
        return [url for url in stripped if url]


def get_html(url):
    """Perform an HTTP GET on the provided url and return the response body.

    Args:
        url (str): url to request

    Returns:
        bytes: the raw, undecoded response body

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` when the request
            fails
    """
    # A browser-like User-Agent is sent instead of urllib's default one
    # (presumably because some sites reject the default agent).
    request = urllib.request.Request(
        url,
        data=None,
        headers={
            'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/35.0.1916.47 Safari/537.36')
        }
    )
    # The context manager closes the response (and its connection) even if
    # reading the body raises.
    with urllib.request.urlopen(request) as response:
        return response.read()


def get_domain(url):
    """Return the network-location component of a url.

    Args:
        url (str): url to parse

    Returns:
        str: the ``netloc`` part of the url (the host, together with any
            port or user info present in the url); an empty string when the
            url has no ``//`` authority section
    """
    return urllib.parse.urlsplit(url).netloc


def get_domain_index(domain):
    """Hand out the next sample number for a domain.

    The running count for each domain is kept in the module-level
    ``DOMAIN_COUNTS`` dict and is advanced by one on every call.

    Args:
        domain (str): the dns domain of the article

    Returns:
        int: the 1-based index of the article
    """
    next_index = DOMAIN_COUNTS.get(domain, 0) + 1
    DOMAIN_COUNTS[domain] = next_index
    return next_index


def main():
    """Download every url listed in ``links.txt`` into ``tests/fixtures``.

    Each page is saved as ``tests/fixtures/<domain>_<n>.html``, where ``n`` is
    the per-domain counter returned by ``get_domain_index``. A manifest
    describing every sample (``url``, ``domain``, ``html_file``) is then
    written to ``tests/fixtures/samples.json``. All paths are relative to the
    current working directory.
    """
    # Imported locally so this function does not rely on a module-level import.
    import os

    filename = 'links.txt'
    fixtures_dir = 'tests/fixtures'
    # Create the output directory up front; without it the first open()
    # below fails with FileNotFoundError on a fresh checkout.
    os.makedirs(fixtures_dir, exist_ok=True)

    samples = []
    for idx, url in enumerate(get_urls(filename)):
        print(idx, url)
        domain = get_domain(url)
        index = get_domain_index(domain)
        html_file = '{}/{}_{}.html'.format(fixtures_dir, domain, index)
        # Fetch before opening the file so a failed request does not leave
        # an empty fixture behind.
        html = get_html(url)
        # The response body is bytes, hence binary mode.
        with open(html_file, 'wb') as f:
            f.write(html)
        samples.append({
            'url': url,
            'domain': domain,
            'html_file': html_file
        })
    with open('{}/samples.json'.format(fixtures_dir), 'w') as f:
        json.dump(samples, f, indent=4)


# Run the download only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
	import json
	import urllib.request
	import urllib.parse


	# Global counter for samples from the same domain.
	# Maps a domain string to the number of samples indexed for it so far;
	# get_domain_index() creates and increments the entries.
	DOMAIN_COUNTS = {}


	def get_urls(filename):
		"""Read URLs from the sample url file, one per line.

		Surrounding whitespace is stripped from every line, and lines that
		are empty after stripping are skipped so that blank or trailing lines
		do not turn into empty urls that fail when they are requested.

		Args:
			filename (str): full path to sample url file

		Returns:
			List[str]: the non-empty urls, in file order
		"""
		with open(filename, 'r') as f:
			stripped = (line.strip() for line in f)
			# Consume the generator while the file is still open.
			return [url for url in stripped if url]


	def get_html(url):
		"""Perform an HTTP GET on the provided url and return the response body.

		Args:
			url (str): url to request

		Returns:
			bytes: the raw, undecoded response body

		Raises:
			urllib.error.URLError: propagated from ``urlopen`` when the
				request fails
		"""
		# A browser-like User-Agent is sent instead of urllib's default one
		# (presumably because some sites reject the default agent).
		request = urllib.request.Request(
			url,
			data=None,
			headers={
				'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) '
				               'AppleWebKit/537.36 (KHTML, like Gecko) '
				               'Chrome/35.0.1916.47 Safari/537.36')
			}
		)
		# The context manager closes the response even if reading raises.
		with urllib.request.urlopen(request) as response:
			return response.read()


	def get_domain(url):
		"""Extract the network location (domain) from a url.

		Args:
			url (str): url to parse

		Returns:
			str: the ``netloc`` part of the url; an empty string when the
				url has no ``//`` authority section
		"""
		parsed_uri = urllib.parse.urlparse(url)
		return parsed_uri.netloc


	def get_domain_index(domain):
		"""Return an index number for the next sample article of a domain.

		The running count for each domain is kept in the ``DOMAIN_COUNTS``
		dict and is advanced by one on every call.

		Args:
			domain (str): the dns domain of the article

		Returns:
			int: the 1-based index of the article
		"""
		DOMAIN_COUNTS.setdefault(domain, 0)
		DOMAIN_COUNTS[domain] += 1
		return DOMAIN_COUNTS[domain]


	def main():
		"""Download every url listed in ``links.txt`` into ``tests/fixtures``.

		Each page is saved as ``tests/fixtures/<domain>_<n>.html``, where
		``n`` is the per-domain counter returned by ``get_domain_index``. A
		manifest describing every sample (``url``, ``domain``, ``html_file``)
		is then written to ``tests/fixtures/samples.json``. All paths are
		relative to the current working directory.
		"""
		# Imported locally so this function does not rely on a module-level
		# import.
		import os

		filename = 'links.txt'
		fixtures_dir = 'tests/fixtures'
		# Create the output directory up front; without it the first open()
		# below fails with FileNotFoundError on a fresh checkout.
		os.makedirs(fixtures_dir, exist_ok=True)

		samples = []
		for idx, url in enumerate(get_urls(filename)):
			print(idx, url)
			domain = get_domain(url)
			index = get_domain_index(domain)
			html_file = '{}/{}_{}.html'.format(fixtures_dir, domain, index)
			# Fetch before opening the file so a failed request does not
			# leave an empty fixture behind.
			html = get_html(url)
			# The response body is bytes, hence binary mode.
			with open(html_file, 'wb') as f:
				f.write(html)
			samples.append({
				'url': url,
				'domain': domain,
				'html_file': html_file
			})
		with open('{}/samples.json'.format(fixtures_dir), 'w') as f:
			json.dump(samples, f, indent=4)


	# Run the download only when executed as a script, not when imported.
	if __name__ == '__main__':
		main()