Link-checker / link-reporter for dokuwiki text files (WIP; Python3.6)
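The gist holds two files: a helper module (functions.py, judging by the import in the entry-point script) and a small defopt-based command-line script. Together they scan a dokuwiki page for link markup of the form [[url|title]] (the title is optional, e.g. [[https://www.example.com|Example]]), send an HTTP request for each external URL, and print a status report; Wayback Machine and Hacker News links are skipped.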
import requests
from collections import namedtuple
from pprint import pprint
from urllib.parse import urlparse

user_agent = ''

Link = namedtuple("Link", "url title")


def remove_newline(input_str):
    # strips exactly one trailing newline, unlike rstrip('\n') which strips all
    if input_str.endswith('\n'):
        output_str = input_str[:-1]
    else:
        output_str = input_str
    return output_str
def print_file(f, lines_to_print):
    line_count = 0
    for raw_line in f:
        line_count += 1
        if line_count <= lines_to_print:
            line = remove_newline(raw_line)
            print(f"{line_count} {line}")
    print(f"\nNumber of lines: {line_count}")
def extract_link_from(f, progress=None, start_at=1):
    # a single line may have 0 or more links; non-links are yielded as None
    for line in f:
        if progress:
            progress['linecounter'] += 1
            if progress['linecounter'] < start_at:
                continue
        try:
            start_idx = -1
            while True:
                start_idx = line.index('[[', start_idx + 1)
                end_idx = line.index(']]', start_idx)
                # print(f"{start_idx} - {end_idx}")
                link_str = line[start_idx + 2:end_idx]
                sep_idx = link_str.find('|')
                if sep_idx > -1:
                    url = link_str[:sep_idx]
                    title = link_str[sep_idx + 1:]
                else:
                    url = link_str
                    title = None
                # print(url)
                if url.startswith('http'):
                    if url.startswith('https://web.archive.org'):
                        yield None  # ignore Wayback Machine links
                    elif 'news.ycombinator.com' in url:
                        yield None  # ignore Hacker News links
                    else:
                        yield Link(url, title)
                else:
                    yield None  # internal wiki link
        except ValueError:
            # line has 0 (further) urls
            yield None
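# Illustration (hypothetical input, not from the gist): for the dokuwiki line
#   "See [[https://example.com|Example]] and [[start|home page]]"
# the generator yields Link(url='https://example.com', title='Example') for the
# external link, then None for the internal link, then one final None when no
# further '[[' is found on the line; callers are expected to skip the Nones.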
def request(url):
    global user_agent
    return requests.get(url, headers={'User-Agent': user_agent})


def request_head(url):
    # note: unlike requests.get, requests.head does not follow redirects by
    # default, which is why process_file handles 301/302 responses itself
    global user_agent
    return requests.head(url, headers={'User-Agent': user_agent})


def extract_owner_repo(url):
    parsed_url = urlparse(url)
    path = parsed_url.path.strip("/")
    owner, repo = path.split("/")[:2]
    return owner, repo
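# Example (hypothetical URL): extract_owner_repo('https://github.com/psf/requests/issues/123')
# strips the path to 'psf/requests/issues/123' and returns ('psf', 'requests');
# a bare profile URL such as 'https://github.com/psf' raises ValueError, which
# process_file below treats as "not a repo url".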
def process_file(f, user_agent, query_gh_api, progress=None, start_at=1):
    # the 'user_agent' parameter shadows the module-level global of the same
    # name, so a nested helper is needed to copy the parameter into the global
    # used by request() and request_head()
    def set_user_agent(ua):
        global user_agent
        user_agent = ua
    set_user_agent(user_agent)
    urls = []
    for link in extract_link_from(f, progress, start_at):
        if not link:
            continue
        urls.append(link.url)
        try:
            r = request_head(link.url)
        # SSLError subclasses ConnectionError in requests, so it must be caught
        # first or the ConnectionError clause swallows it
        except requests.exceptions.SSLError:
            print(f"SSL {link.url}")
            continue
        except requests.exceptions.ConnectionError:
            print(f"ERR {link.url}")
            continue
        headers = dict(r.headers.items())
        if r.status_code == 200:
            if query_gh_api and ('github.com' in link.url) and ('gist.' not in link.url):
                try:
                    owner, repo = extract_owner_repo(link.url)
                except ValueError:
                    # most likely not a repo url
                    continue
                # TODO handlers for the API call below:
                # except requests.exceptions.JSONDecodeError:
                #     comes up erratically, rerunning script should help
                # except KeyError:
                #     GitHub API limit reached(?); unfortunately comes up too easily
                r2 = requests.get(f"https://api.github.com/repos/{owner}/{repo}/commits",
                                  params={'per_page': '1'},
                                  headers={'User-Agent': user_agent})
                if r2.status_code == 403:
                    print(f"403 FORBIDDEN github.com/{owner}/{repo}")
                    continue
                if r2.status_code != 404:
                    print(f"github.com/{owner}/{repo} latest commit at: "
                          f"{r2.json()[0]['commit']['author']['date']}")
                    continue
        if r.status_code == 405:
            # some servers reject HEAD; retry with a full GET
            r2 = request(link.url)
            if r2.status_code != 200:
                print(f"{r2.status_code} {link.url}")
                continue
        print(f"{r.status_code} {link.url}")
        if r.status_code in (301, 302):
            try:
                new_url = headers['Location']
            except KeyError:
                print("-> ???")
                continue
            if new_url.startswith('http'):
                try:
                    r2 = request_head(new_url)
                    print(f"-> {r2.status_code} {new_url}")
                except requests.exceptions.SSLError:
                    print(f"-> SSL {new_url}")
                except requests.exceptions.ConnectionError:
                    print(f"-> ERR {new_url}")
            else:
                # relative redirect target; report it but don't follow
                print(f"-> TRY {new_url}")
        if r.status_code == 403:
            pprint(headers, indent=4)
    # print(f"Found {len(urls)} urls")
# ---- entry-point script (a second file; it imports from the functions module above) ----
# Note: on WSL Windows path separators will be "eaten" so you must use forward slashes!
import logging

import defopt

from functions import process_file, print_file

MAX_LINES_TO_PRINT = 6  # Only used for debugging
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0'
progress = {'linecounter': 0}  # Using a dict enables the functions module to modify the counter in place
def main(filepath, *, user_agent=USER_AGENT, github=False, n=1):
    """
    Wiki-link-reporter v0.2

    :param str filepath: Path to file, e.g. data/pages/foo.txt
    :param str user_agent: User-Agent string (from your browser)
    :param bool github: Query GitHub API for info on the latest commits
    :param int n: Start working from line N
    """
    try:
        with open(filepath, encoding='utf-8') as f:
            # print()
            # print_file(f, MAX_LINES_TO_PRINT)
            process_file(f, user_agent, query_gh_api=github, progress=progress, start_at=n)
    except Exception:
        logging.exception(f"Was on line {progress['linecounter']}")  # Gives us a traceback as well


if __name__ == '__main__':
    defopt.run(main)
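A possible invocation, assuming the entry-point script is saved as report.py (the gist does not name the files). defopt builds the command-line interface from main()'s signature and docstring, so check --help for the exact flag spellings:

    python report.py data/pages/foo.txt --github -n 120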