Last active
July 2, 2022 12:26
-
-
Save salgo60/f33116056e2698186b9ee424626246fd to your computer and use it in GitHub Desktop.
Link rot in kulturarvsdata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' Uses a query from Quarry that generates 164 143 links;
the result is errors in 7602 unique URLs.
* Wikipedia discussion https://sv.wikipedia.org/wiki/Malldiskussion:BBR-l%C3%A4nk#BeBR_410-poster_tillbaka
* Notebook https://github.com/salgo60/open-data-examples/blob/master/Icke%20fungerande%20RA%C3%84%20l%C3%A4nkar.ipynb
'''
import urllib3 | |
import sys,json | |
http = urllib3.PoolManager() | |
def get_file(filename=None):
    """Load the saved query result from a JSON file.

    Args:
        filename: Path of the JSON file to read. When omitted, the path
            that this script has always used is read.

    Returns:
        The decoded JSON document. The driver code in this file reads its
        "rows" entry.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    if filename is None:
        filename = "/Users/magnus/Documents/GitHub/PythonDataScienceHandbook/notebooks/Ksamsok20200629.json"
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(filename, encoding="utf-8") as jsonfile:
        return json.load(jsonfile)
def write_file(NonWorkingURL, filename='nonWorkingURLS.txt'):
    """Write the non-working URLs to a text file, one URL per line.

    Args:
        NonWorkingURL: Iterable of URL strings to save.
        filename: Path of the output file; it is overwritten if it exists.

    Raises:
        OSError: If the output file cannot be opened for writing.
    """
    with open(filename, 'w', encoding='utf-8') as fw:
        # Each URL needs its own line terminator, otherwise all URLs run
        # together. Sorting makes the output reproducible between runs.
        fw.writelines(url + "\n" for url in sorted(NonWorkingURL))
def check(url):
    """Check whether a URL is OK.

    Sends a GET request for the URL through the module-level ``http``
    pool manager.

    Args:
        url: The URL to request.

    Returns:
        True if the response status is 200. False for any other status,
        or when urllib3 fails to complete the request.
    """
    try:
        r = http.request('GET', url)
    except urllib3.exceptions.HTTPError as err:
        # A single unreachable or malformed URL must not abort the whole
        # run; report it and count it as not working.
        print("Error: ", type(err).__name__, " \t", url)
        return False
    if r.status != 200:
        print("Status: ",r.status, " \t", url)
        return False
    return True
# Tally of rows whose URL answered with status 200 / anything else.
ok = notok = 0
# Distinct URLs that failed the check.
NonWorkingURL = set()
results = get_file()
for row in results["rows"]:
    # The URL is the first column of each result row.
    url = row[0]
    if not check(url):
        notok += 1
        NonWorkingURL.add(url)
        continue
    ok += 1
# Report the totals once every row has been processed, then save the failures.
print("Ok: ",ok,"\tNot ok", notok)
write_file(NonWorkingURL)
new test 2020-11-20
160548
OK: 154158 not ok 6390
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
164143
Result: Ok: 156539 Not ok 7604
See