Skip to content

Instantly share code, notes, and snippets.

@da1910
Created May 11, 2021 14:04
Show Gist options
  • Save da1910/79c168294a8dfe2957a8cbc61daa1710 to your computer and use it in GitHub Desktop.
Save da1910/79c168294a8dfe2957a8cbc61daa1710 to your computer and use it in GitHub Desktop.
Test script to compare chardet and charset_normalizer
import chardet
import charset_normalizer
import requests
import csv
import re
import warnings
import tqdm
# Fetch every URL listed in /URL_LIST.csv, record the charset declared in the
# Content-Type header (if any) next to the encodings guessed by chardet and
# charset_normalizer, then write the results to ./all_results.csv and a
# markdown table to ./differences.md.
client = requests.Session()
results = []
# HTTP parameter names are case-insensitive, so "Charset=UTF-8" must match too.
charset_regex = re.compile(r"charset=([^;]+)", re.IGNORECASE)
# Charset_normalizer warns if the data contains encoding information
warnings.filterwarnings('ignore', 'Trying to detect')

# Change value if urls are not in the second column
URL_COLUMN = 1

# newline='' lets the csv module do its own newline handling.
with open("/URL_LIST.csv", 'r', newline='') as fp:
    data = fp.readlines()
file_data = csv.reader(data)
# Comment if data has no heading row
# (the default keeps an empty input file from raising StopIteration)
next(file_data, None)

skipped = 0
no_encoding = 0
done = 0
for row in file_data:
    if len(row) <= URL_COLUMN:
        # Blank or truncated line: there is no URL to try.
        continue
    url = row[URL_COLUMN]
    print("Trying url {0}".format(url))
    # Best-effort crawl: any failure falls back to http and then skips the
    # URL. `except Exception` (rather than a bare `except`) keeps that
    # behaviour while still letting Ctrl-C / SystemExit stop the run.
    try:
        response = client.get("https://{0}".format(url), timeout=2)
    except Exception:
        print("\tFailed https, trying http")
        try:
            response = client.get("http://{0}".format(url), timeout=2)
        except Exception:
            print("\tFailed http, skipping")
            skipped += 1
            continue
    done += 1

    # A header that mentions "charset" without a usable "charset=<value>"
    # is counted as declaring no encoding instead of crashing on .group().
    charset_match = charset_regex.search(response.headers.get('Content-Type', ''))
    declared_encoding = None
    if charset_match:
        # Drop surrounding whitespace and optional quotes: charset="utf-8"
        declared_encoding = charset_match.group(1).strip().strip('"\'') or None
    if declared_encoding is None:
        no_encoding += 1

    chardet_encoding = chardet.detect(response.content)
    normalizer_encoding = charset_normalizer.detect(response.content)
    results.append([url, declared_encoding, chardet_encoding['encoding'], normalizer_encoding['encoding']])

print("Run finished, resolved {0} urls successfully, skipped {1}. {2} reported no encoding scheme".format(done, skipped,
                                                                                                          no_encoding))

# newline='' stops the csv writer from emitting blank rows on Windows.
with open('./all_results.csv', 'w', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerows(results)

with open('./differences.md', 'w') as fp:
    fp.write("| URL | Chardet | Charset_Normalizer |\n")
    fp.write("| --- | ------- | ------------------ |\n")
    for row in results:
        # NOTE(review): this lists every URL without a declared charset, not
        # only those where the two detectors disagree - confirm that is the
        # intended meaning of "differences".
        if row[1] is None:
            fp.write("| {0} | {1} | {2} |\n".format(row[0], row[2], row[3]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment