Link-checker / link-reporter for dokuwiki text files (WIP; Python3.6)
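The gist holds two files: a helper module (functions.py, judging by the import in the entry-point script) and a small defopt-based command-line script. Together they scan a dokuwiki page for link markup of the form [[url|title]] (the title is optional, e.g. [[https://www.example.com|Example]]), send an HTTP request for each external URL, and print a status report; Wayback Machine and Hacker News links are skipped.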
import requests
from collections import namedtuple
from pprint import pprint
from urllib.parse import urlparse

user_agent = ''

Link = namedtuple("Link", "url title")


def remove_newline(input_str):
    # strips exactly one trailing newline, unlike rstrip('\n') which strips all
    if input_str.endswith('\n'):
        output_str = input_str[:-1]
    else:
        output_str = input_str
    return output_str
def print_file(f, lines_to_print):
    line_count = 0
    for raw_line in f:
        line_count += 1
        if line_count <= lines_to_print:
            line = remove_newline(raw_line)
            print(f"{line_count} {line}")
    print(f"\nNumber of lines: {line_count}")
def extract_link_from(f, progress=None, start_at=1):
    # a single line may have 0 or more links; non-links are yielded as None
    for line in f:
        if progress:
            progress['linecounter'] += 1
            if progress['linecounter'] < start_at:
                continue
        try:
            start_idx = -1
            while True:
                start_idx = line.index('[[', start_idx + 1)
                end_idx = line.index(']]', start_idx)
                # print(f"{start_idx} - {end_idx}")
                link_str = line[start_idx + 2:end_idx]
                sep_idx = link_str.find('|')
                if sep_idx > -1:
                    url = link_str[:sep_idx]
                    title = link_str[sep_idx + 1:]
                else:
                    url = link_str
                    title = None
                # print(url)
                if url.startswith('http'):
                    if url.startswith('https://web.archive.org'):
                        yield None  # ignore Wayback Machine links
                    elif 'news.ycombinator.com' in url:
                        yield None  # ignore Hacker News links
                    else:
                        yield Link(url, title)
                else:
                    yield None  # internal wiki link
        except ValueError:
            # line has 0 (further) urls
            yield None
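# Illustration (hypothetical input, not from the gist): for the dokuwiki line
#   "See [[https://example.com|Example]] and [[start|home page]]"
# the generator yields Link(url='https://example.com', title='Example') for the
# external link, then None for the internal link, then one final None when no
# further '[[' is found on the line; callers are expected to skip the Nones.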
def request(url):
    global user_agent
    return requests.get(url, headers={'User-Agent': user_agent})


def request_head(url):
    # note: unlike requests.get, requests.head does not follow redirects by
    # default, which is why process_file handles 301/302 responses itself
    global user_agent
    return requests.head(url, headers={'User-Agent': user_agent})


def extract_owner_repo(url):
    parsed_url = urlparse(url)
    path = parsed_url.path.strip("/")
    owner, repo = path.split("/")[:2]
    return owner, repo
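# Example (hypothetical URL): extract_owner_repo('https://github.com/psf/requests/issues/123')
# strips the path to 'psf/requests/issues/123' and returns ('psf', 'requests');
# a bare profile URL such as 'https://github.com/psf' raises ValueError, which
# process_file below treats as "not a repo url".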
def process_file(f, user_agent, query_gh_api, progress=None, start_at=1):
    # the 'user_agent' parameter shadows the module-level global of the same
    # name, so a nested helper is needed to copy the parameter into the global
    # used by request() and request_head()
    def set_user_agent(ua):
        global user_agent
        user_agent = ua
    set_user_agent(user_agent)
    urls = []
    for link in extract_link_from(f, progress, start_at):
        if not link:
            continue
        urls.append(link.url)
        try:
            r = request_head(link.url)
        # SSLError subclasses ConnectionError in requests, so it must be caught
        # first or the ConnectionError clause swallows it
        except requests.exceptions.SSLError:
            print(f"SSL {link.url}")
            continue
        except requests.exceptions.ConnectionError:
            print(f"ERR {link.url}")
            continue
        headers = dict(r.headers.items())
        if r.status_code == 200:
            if query_gh_api and ('github.com' in link.url) and ('gist.' not in link.url):
                try:
                    owner, repo = extract_owner_repo(link.url)
                except ValueError:
                    # most likely not a repo url
                    continue
                # TODO handlers for the API call below:
                # except requests.exceptions.JSONDecodeError:
                #     comes up erratically, rerunning script should help
                # except KeyError:
                #     GitHub API limit reached(?); unfortunately comes up too easily
                r2 = requests.get(f"https://api.github.com/repos/{owner}/{repo}/commits",
                                  params={'per_page': '1'},
                                  headers={'User-Agent': user_agent})
                if r2.status_code == 403:
                    print(f"403 FORBIDDEN github.com/{owner}/{repo}")
                    continue
                if r2.status_code != 404:
                    print(f"github.com/{owner}/{repo} latest commit at: "
                          f"{r2.json()[0]['commit']['author']['date']}")
                    continue
        if r.status_code == 405:
            # some servers reject HEAD; retry with a full GET
            r2 = request(link.url)
            if r2.status_code != 200:
                print(f"{r2.status_code} {link.url}")
                continue
        print(f"{r.status_code} {link.url}")
        if r.status_code in (301, 302):
            try:
                new_url = headers['Location']
            except KeyError:
                print("-> ???")
                continue
            if new_url.startswith('http'):
                try:
                    r2 = request_head(new_url)
                    print(f"-> {r2.status_code} {new_url}")
                except requests.exceptions.SSLError:
                    print(f"-> SSL {new_url}")
                except requests.exceptions.ConnectionError:
                    print(f"-> ERR {new_url}")
            else:
                # relative redirect target; report it but don't follow
                print(f"-> TRY {new_url}")
        if r.status_code == 403:
            pprint(headers, indent=4)
    # print(f"Found {len(urls)} urls")
# ---- entry-point script (a second file; it imports from the functions module above) ----
# Note: on WSL Windows path separators will be "eaten" so you must use forward slashes!
import logging

import defopt

from functions import process_file, print_file

MAX_LINES_TO_PRINT = 6  # Only used for debugging
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0'
progress = {'linecounter': 0}  # Using a dict enables the functions module to modify the counter in place
def main(filepath, *, user_agent=USER_AGENT, github=False, n=1):
    """
    Wiki-link-reporter v0.2

    :param str filepath: Path to file, e.g. data/pages/foo.txt
    :param str user_agent: User-Agent string (from your browser)
    :param bool github: Query GitHub API for info on the latest commits
    :param int n: Start working from line N
    """
    try:
        with open(filepath, encoding='utf-8') as f:
            # print()
            # print_file(f, MAX_LINES_TO_PRINT)
            process_file(f, user_agent, query_gh_api=github, progress=progress, start_at=n)
    except Exception:
        logging.exception(f"Was on line {progress['linecounter']}")  # Gives us a traceback as well


if __name__ == '__main__':
    defopt.run(main)
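A possible invocation, assuming the entry-point script is saved as report.py (the gist does not name the files). defopt builds the command-line interface from main()'s signature and docstring, so check --help for the exact flag spellings:

    python report.py data/pages/foo.txt --github -n 120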