-
-
Save zweizeichen/10bca3803b54070090ac48f5173910b2 to your computer and use it in GitHub Desktop.
"""List domains from the Chrome browsing history that appear in the
Cloudflare domain list.

Reads the `urls` table of a copied Chrome history database, reduces every URL
to its registered domain, and prints the sorted intersection with the domains
listed in the Cloudflare text file (one domain per line).
"""
import os
import sqlite3

import tldextract

# Copy history from ~/Library/Application Support/Google/Chrome/Default/History
# (or point this at the absolute path of the database).
HISTORY_DB = 'History'
# Get domains here: https://github.com/pirate/sites-using-cloudflare
CF_DOMAINS_FILE = 'sorted_unique_cf.txt'

history_domains = set()
skipped_urls = 0

print("Loading domains from Chrome browsing history...")

# sqlite3.connect() creates a new, empty database when the path does not
# exist, which only surfaces later as "no such table: urls". Fail early with
# a message that names the real problem.
if not os.path.isfile(HISTORY_DB):
    raise SystemExit("History database not found: %s" % HISTORY_DB)

conn = sqlite3.connect(HISTORY_DB)
try:
    c = conn.cursor()
    for url in c.execute("SELECT url FROM urls"):
        try:
            registered = tldextract.extract(url[0]).registered_domain
        except ValueError:
            # Hostname decoding inside tldextract can raise ValueError for
            # some labels; skip that URL instead of aborting the whole run.
            skipped_urls += 1
            continue
        # URLs without a registrable domain yield an empty string; keeping it
        # would inflate the count and could match a blank line in the list.
        if registered:
            history_domains.add(registered)
finally:
    conn.close()

if skipped_urls:
    print("Skipped %d URLs that could not be parsed." % skipped_urls)
print("Added %d domains." % len(history_domains))

print("Loading Cloudflare domains...")

with open(CF_DOMAINS_FILE) as cf_file:
    cf_domains = set(line.strip() for line in cf_file)
# Blank lines in the list are not domains.
cf_domains.discard('')

print("Added %d domains." % len(cf_domains))

print("Processing intersection...")

intersection = history_domains.intersection(cf_domains)

print("------------------------------")

for domain in sorted(intersection):
    print(domain)

print("------------------------------\nOK: %d domains found." % len(intersection))
"""List domains from the Firefox browsing history that appear in the
Cloudflare domain list.

Reads the `moz_places` table of a copied Firefox places database, reduces
every URL to its registered domain, and prints the sorted intersection with
the domains listed in the Cloudflare text file (one domain per line).
"""
import os
import sqlite3

import tldextract

# Copy history from
# ~/Library/Application Support/Firefox/Profiles/*YOUR PROFILE*/places.sqlite
# (or point this at the absolute path of the database).
HISTORY_DB = 'places.sqlite'
# Get domains here: https://github.com/pirate/sites-using-cloudflare
CF_DOMAINS_FILE = 'sorted_unique_cf.txt'

history_domains = set()
skipped_urls = 0

print("Loading domains from Firefox browsing history...")

# sqlite3.connect() creates a new, empty database when the path does not
# exist, which only surfaces later as "no such table: moz_places". Fail early
# with a message that names the real problem.
if not os.path.isfile(HISTORY_DB):
    raise SystemExit("History database not found: %s" % HISTORY_DB)

conn = sqlite3.connect(HISTORY_DB)
try:
    c = conn.cursor()
    for url in c.execute("SELECT url FROM moz_places"):
        try:
            registered = tldextract.extract(url[0]).registered_domain
        except ValueError:
            # Hostname decoding inside tldextract can raise ValueError for
            # some labels; skip that URL instead of aborting the whole run.
            skipped_urls += 1
            continue
        # URLs without a registrable domain yield an empty string; keeping it
        # would inflate the count and could match a blank line in the list.
        if registered:
            history_domains.add(registered)
finally:
    conn.close()

if skipped_urls:
    print("Skipped %d URLs that could not be parsed." % skipped_urls)
print("Added %d domains." % len(history_domains))

print("Loading Cloudflare domains...")

with open(CF_DOMAINS_FILE) as cf_file:
    cf_domains = set(line.strip() for line in cf_file)
# Blank lines in the list are not domains.
cf_domains.discard('')

print("Added %d domains." % len(cf_domains))

print("Processing intersection...")

intersection = history_domains.intersection(cf_domains)

print("------------------------------")

for domain in sorted(intersection):
    print(domain)

print("------------------------------\nOK: %d domains found." % len(intersection))
tldextract |
"""List domains from the Safari browsing history that appear in the
Cloudflare domain list.

Reads the `history_items` table of a copied Safari history database, reduces
every URL to its registered domain, and prints the sorted intersection with
the domains listed in the Cloudflare text file (one domain per line).
"""
import os
import sqlite3

import tldextract

# Copy history from ~/Library/Safari/History.db
# (or point this at the absolute path of the database).
HISTORY_DB = 'History.db'
# Get domains here: https://github.com/pirate/sites-using-cloudflare
CF_DOMAINS_FILE = 'sorted_unique_cf.txt'

history_domains = set()
skipped_urls = 0

print("Loading domains from Safari browsing history...")

# sqlite3.connect() creates a new, empty database when the path does not
# exist, which only surfaces later as "no such table: history_items". Fail
# early with a message that names the real problem.
if not os.path.isfile(HISTORY_DB):
    raise SystemExit("History database not found: %s" % HISTORY_DB)

conn = sqlite3.connect(HISTORY_DB)
try:
    c = conn.cursor()
    for url in c.execute("SELECT url FROM history_items"):
        try:
            registered = tldextract.extract(url[0]).registered_domain
        except ValueError:
            # Punycode decoding inside tldextract can raise ValueError for
            # some labels (e.g. "unichr() arg not in range(0x10000)" on narrow
            # Python 2 builds); skip that URL instead of aborting the run.
            skipped_urls += 1
            continue
        # URLs without a registrable domain yield an empty string; keeping it
        # would inflate the count and could match a blank line in the list.
        if registered:
            history_domains.add(registered)
finally:
    conn.close()

if skipped_urls:
    print("Skipped %d URLs that could not be parsed." % skipped_urls)
print("Added %d domains." % len(history_domains))

print("Loading Cloudflare domains...")

with open(CF_DOMAINS_FILE) as cf_file:
    cf_domains = set(line.strip() for line in cf_file)
# Blank lines in the list are not domains.
cf_domains.discard('')

print("Added %d domains." % len(cf_domains))

print("Processing intersection...")

intersection = history_domains.intersection(cf_domains)

print("------------------------------")

for domain in sorted(intersection):
    print(domain)

print("------------------------------\nOK: %d domains found." % len(intersection))
@sinnfeinn:
I guess you did not change the code properly to point to your Chrome History.
Maybe there are astral-plane Unicode characters in my history? I tried using decode('unicode-escape')
as explained here http://stackoverflow.com/questions/7105874/valueerror-unichr-arg-not-in-range0x10000-narrow-python-build-please-hel
but to no avail.
Any pointers appreciated.
Loading domains from Safari browsing history...
No handlers could be found for logger "tldextract"
Traceback (most recent call last):
File "safari.py", line 13, in <module>
history_domains.add(tldextract.extract(url[0]).registered_domain)
File "/Library/Python/2.7/site-packages/tldextract/tldextract.py", line 329, in extract
return TLD_EXTRACTOR(url)
File "/Library/Python/2.7/site-packages/tldextract/tldextract.py", line 200, in __call__
translation = idna.decode(label.encode('ascii'))
File "/Library/Python/2.7/site-packages/idna/core.py", line 384, in decode
result.append(ulabel(label))
File "/Library/Python/2.7/site-packages/idna/core.py", line 302, in ulabel
label = label.decode('punycode')
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/encodings/punycode.py", line 208, in decode
res = punycode_decode(input, errors)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/encodings/punycode.py", line 195, in punycode_decode
return insertion_sort(base, extended, errors)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/encodings/punycode.py", line 180, in insertion_sort
base = base[:pos] + unichr(char) + base[pos:]
ValueError: unichr() arg not in range(0x10000) (narrow Python build)
Here is what I had to do to get safari.py working on a relatively new OS X 10.11.6 install:
-
sudo easy_install pip
-
sudo pip install tldextract
-
curl https://raw.githubusercontent.com/pirate/sites-using-cloudflare/master/sorted_unique_cf.txt --output /Users/YOUR_USERNAME/Desktop/sorted_unique_cf.txt
-
In safari.py, change
conn = sqlite3.connect('History.db')
to
conn = sqlite3.connect('/Users/YOUR_USERNAME/Library/Safari/History.db')
and
sorted_unique_cf.txt
to
/Users/YOUR_USERNAME/Desktop/sorted_unique_cf.txt
@tinyapps, @Payuing, @skypather
Thanks! Changing it to the following worked:
conn = sqlite3.connect('/Users/YOUR_USER/Library/Application Support/Google/Chrome/Default/History')
and
cf_domains = set(domain.strip() for domain in open('/Users/YOUR_USER/Desktop/sorted_unique_cf.txt'))
for chrome.py I get:
  File "/Users/andreas/Desktop/chrome.py", line 12, in <module>
    for url in c.execute("SELECT url FROM urls"):
sqlite3.OperationalError: no such table: urls