da1910/test_encoding.py

## test_encoding.py
import chardet
import charset_normalizer
import requests
import csv
import re
import warnings
import tqdm

# Compare the encoding a server declares with what chardet and
# charset_normalizer detect, for every URL listed in /URL_LIST.csv.
# Writes all rows to ./all_results.csv and a Markdown table to ./differences.md.

client = requests.Session()
results = []
# Media-type parameter names are case-insensitive, so "Charset=" must match too.
charset_regex = re.compile(r"charset=([^;]+)", re.IGNORECASE)

# Charset_normalizer warns if the data contains encoding information
warnings.filterwarnings('ignore', 'Trying to detect')


def _fetch(url):
    """Request *url* over https, then over http if that fails.

    Returns the response object, or None when both attempts fail.
    """
    # `except Exception` rather than a bare `except`: the run stays best-effort
    # for any request failure, but Ctrl-C (KeyboardInterrupt) and SystemExit
    # are no longer swallowed.
    try:
        return client.get("https://{0}".format(url), timeout=2)
    except Exception:
        print("\tFailed https, trying http")
    try:
        return client.get("http://{0}".format(url), timeout=2)
    except Exception:
        print("\tFailed http, skipping")
    return None


def _declared_charset(response):
    """Return the charset named in the Content-Type header, or None."""
    match = charset_regex.search(response.headers.get('Content-Type', ''))
    if match is None:
        # Header missing, or it has no "charset=<value>" parameter.
        return None
    # The value may legally be quoted (charset="utf-8"); record the bare name.
    return match.group(1).strip().strip('"\'') or None


# newline='' lets the csv module handle line endings itself.
with open("/URL_LIST.csv", 'r', newline='') as fp:
    file_data = iter(list(csv.reader(fp)))

# Comment if data has no heading row
next(file_data, None)

skipped = 0
no_encoding = 0
done = 0

for row in file_data:
    # Change value if urls are not in the second column
    url = row[1].strip() if len(row) > 1 else ''
    if not url:
        # Blank or short line: there is no URL to request.
        continue
    print("Trying url {0}".format(url))
    response = _fetch(url)
    if response is None:
        skipped += 1
        continue
    done += 1
    declared_encoding = _declared_charset(response)
    if declared_encoding is None:
        no_encoding += 1
    chardet_encoding = chardet.detect(response.content)
    normalizer_encoding = charset_normalizer.detect(response.content)
    results.append([url, declared_encoding, chardet_encoding['encoding'], normalizer_encoding['encoding']])

print("Run finished, resolved {0} urls successfully, skipped {1}. {2} reported no encoding scheme".format(
    done, skipped, no_encoding))

# newline='' avoids blank rows between records on Windows.
with open('./all_results.csv', 'w', newline='', encoding='utf-8') as fp:
    writer = csv.writer(fp)
    writer.writerows(results)

with open('./differences.md', 'w', encoding='utf-8') as fp:
    fp.write("| URL | Chardet | Charset_Normalizer |\n")
    fp.write("| --- | ------- | ------------------ |\n")
    # NOTE(review): rows are selected when the server declared no charset, not
    # when the two detectors disagree -- confirm this matches the file name.
    for row in results:
        if row[1] is None:
            fp.write("| {0} | {1} | {2} |\n".format(row[0], row[2], row[3]))
	import chardet
	import charset_normalizer
	import requests
	import csv
	import re
	import warnings
	import tqdm

	# Compare the encoding a server declares with what chardet and
	# charset_normalizer detect, for every URL listed in /URL_LIST.csv.
	# Writes all rows to ./all_results.csv and a Markdown table to ./differences.md.

	client = requests.Session()
	results = []
	# Media-type parameter names are case-insensitive, so "Charset=" must match too.
	charset_regex = re.compile(r"charset=([^;]+)", re.IGNORECASE)

	# Charset_normalizer warns if the data contains encoding information
	warnings.filterwarnings('ignore', 'Trying to detect')


	def _fetch(url):
	    """Request *url* over https, then over http if that fails.

	    Returns the response object, or None when both attempts fail.
	    """
	    # `except Exception` rather than a bare `except`: the run stays best-effort
	    # for any request failure, but Ctrl-C (KeyboardInterrupt) and SystemExit
	    # are no longer swallowed.
	    try:
	        return client.get("https://{0}".format(url), timeout=2)
	    except Exception:
	        print("\tFailed https, trying http")
	    try:
	        return client.get("http://{0}".format(url), timeout=2)
	    except Exception:
	        print("\tFailed http, skipping")
	    return None


	def _declared_charset(response):
	    """Return the charset named in the Content-Type header, or None."""
	    match = charset_regex.search(response.headers.get('Content-Type', ''))
	    if match is None:
	        # Header missing, or it has no "charset=<value>" parameter.
	        return None
	    # The value may legally be quoted (charset="utf-8"); record the bare name.
	    return match.group(1).strip().strip('"\'') or None


	# newline='' lets the csv module handle line endings itself.
	with open("/URL_LIST.csv", 'r', newline='') as fp:
	    file_data = iter(list(csv.reader(fp)))

	# Comment if data has no heading row
	next(file_data, None)

	skipped = 0
	no_encoding = 0
	done = 0

	for row in file_data:
	    # Change value if urls are not in the second column
	    url = row[1].strip() if len(row) > 1 else ''
	    if not url:
	        # Blank or short line: there is no URL to request.
	        continue
	    print("Trying url {0}".format(url))
	    response = _fetch(url)
	    if response is None:
	        skipped += 1
	        continue
	    done += 1
	    declared_encoding = _declared_charset(response)
	    if declared_encoding is None:
	        no_encoding += 1
	    chardet_encoding = chardet.detect(response.content)
	    normalizer_encoding = charset_normalizer.detect(response.content)
	    results.append([url, declared_encoding, chardet_encoding['encoding'], normalizer_encoding['encoding']])

	print("Run finished, resolved {0} urls successfully, skipped {1}. {2} reported no encoding scheme".format(
	    done, skipped, no_encoding))

	# newline='' avoids blank rows between records on Windows.
	with open('./all_results.csv', 'w', newline='', encoding='utf-8') as fp:
	    writer = csv.writer(fp)
	    writer.writerows(results)

	with open('./differences.md', 'w', encoding='utf-8') as fp:
	    # Plain "|" separators: "\|" is an invalid escape in a Python string and
	    # would write a literal backslash into the table.
	    fp.write("| URL | Chardet | Charset_Normalizer |\n")
	    fp.write("| --- | ------- | ------------------ |\n")
	    # NOTE(review): rows are selected when the server declared no charset, not
	    # when the two detectors disagree -- confirm this matches the file name.
	    for row in results:
	        if row[1] is None:
	            fp.write("| {0} | {1} | {2} |\n".format(row[0], row[2], row[3]))