Skip to content

Instantly share code, notes, and snippets.

@Jiali-Qi
Created September 16, 2019 17:33
Show Gist options
  • Save Jiali-Qi/5495d758a7fa659d987fddf4717be449 to your computer and use it in GitHub Desktop.
Save Jiali-Qi/5495d758a7fa659d987fddf4717be449 to your computer and use it in GitHub Desktop.
Assignment 2
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 16 11:55:28 2019
@author: qijia
"""
import requests
from bs4 import BeautifulSoup
import requests
import re
import csv
from requests.exceptions import RequestException
# Scrape the Wikipedia list of top-level domains, save them to a CSV file, then
# probe "http://example<tld>" for every domain and save the HTTP status (or
# 'error') to a second CSV file.
url = 'https://en.m.wikipedia.org/wiki/List_of_Internet_top-level_domains'

# Seconds to wait on any single HTTP request. Without a timeout, requests.get()
# can block indefinitely on an unresponsive host and stall the whole run.
REQUEST_TIMEOUT = 10

r = requests.get(url, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error response instead of scraping an error page and
# silently writing empty/bogus CSV files.
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')

# Match anchors whose visible text starts with a dot, e.g.
# <a href="..." title="...">.com</a>, and capture that text.
pattern = re.compile(r'title=".*?">([.].*?)</a>')
page_html = str(soup)
target = pattern.findall(page_html)
# The last match is discarded, as in the original script.
# NOTE(review): presumably the final hit on the page is not a real TLD entry --
# confirm against the current page markup.
target = target[:-1]
print(target)

# Write the extracted domains, one per row.
# newline='' is required by the csv module; otherwise an extra blank row is
# emitted after every record on platforms that translate '\n' to '\r\n'.
with open('BIA660_HWK02.csv', 'w', encoding='utf-8', newline='') as file1:
    writer = csv.writer(file1)
    writer.writerows([tld] for tld in target)

# Probe each domain and record the outcome, one row per domain. Each row holds
# a single cell containing the {domain: status} mapping, as before.
with open('BIA660_HWK02_part2.csv', 'w', encoding='utf-8', newline='') as file2:
    writer = csv.writer(file2)
    for tld in target:
        check_url = "http://example" + tld
        try:
            check = requests.get(check_url, timeout=REQUEST_TIMEOUT)
        except RequestException:
            # Covers DNS failures, refused connections, timeouts, etc.
            domains_checked = {tld: 'error'}
        else:
            domains_checked = {tld: check.status_code}
        print(domains_checked)
        writer.writerow([domains_checked])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment