Last active
December 29, 2022 17:10
-
-
Save fee1-dead/8428cd954b55d83043f94a1753e91a18 to your computer and use it in GitHub Desktop.
I hereby release the following source code under the Apache License 2.0 or under the MIT license, at your option.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# MIT or Apache-2 | |
import bs4 | |
import regex as re | |
import wayback | |
import mwparserfromhell | |
import pywikibot | |
from pywikibot.bot import (ExistingPageBot, SingleSiteBot) | |
from pywikibot.pagegenerators import SearchPageGenerator | |
from urllib import parse | |
import datetime | |
class ScannerBot(ExistingPageBot, SingleSiteBot):
    """Bot that strips Twitter tracking query parameters from tweet links
    in mainspace wikitext, and repoints ``archive-url`` parameters of
    {{cite web}} / {{cite tweet}} templates at a Wayback Machine snapshot
    of the cleaned URL.

    Pages are found via an ``insource:`` regex search for tweet URLs that
    carry one of the known tracking parameters.
    """

    # Query-string keys Twitter appends purely for click tracking.
    BAD_PARAMS = ('cxt', 'ref_src', 'ref_url', 's', 't')

    update_options = {
        'clean': False,
        'create': False,
        'merge': False,
    }

    def __init__(self) -> None:
        """Connect to Wikipedia and build the search generator."""
        site = pywikibot.Site(fam='wikipedia')
        self.wayback = wayback.WaybackClient()
        super().__init__(
            site=site,
            generator=SearchPageGenerator(
                r'insource:/twitter\.com\/[a-zA-Z0-9]+\/status\/[0-9]+\/?\?'
                r'([st]|cxt|ref_[a-z]+)=/',
                site=site))

    @staticmethod
    def _strip_tracking(url_str: str) -> str:
        """Return *url_str* with the known tracking query parameters removed.

        Non-tracking parameters (and their order, as re-encoded by
        ``urlencode``) are preserved.
        """
        parts = parse.urlparse(url_str)
        qs = parse.parse_qs(parts.query)
        for param in ScannerBot.BAD_PARAMS:
            qs.pop(param, None)
        return parse.urlunparse(parts._replace(query=parse.urlencode(qs, doseq=True)))

    def treat_page(self) -> None:
        """Clean tweet URLs on the current page and save it if anything changed."""
        if self.current_page.namespace() != 0:
            return  # mainspace articles only

        newtext = str(self.current_page.text)
        changed = False
        cnt = 0
        # Negative lookbehind skips URLs that are themselves inside an
        # archive/cache wrapper; it is variable-width, which is why the
        # third-party `regex` module is used instead of stdlib `re`.
        pattern = (r"(?<!\?url=|/|cache:)https?://(?:mobile\.)?"
                   r"twitter\.com/\w+/status/\d+\?[^\s}<|]+")
        # Iterate matches right-to-left so earlier match offsets remain
        # valid while replacements are spliced into the string.
        for m in reversed(list(re.finditer(pattern, newtext))):
            new_url = self._strip_tracking(m.group(0))
            # BUG FIX: the original compared new_url against m.string (the
            # whole page text), so *every* match counted as a change.
            # Compare against the matched URL itself.
            if new_url != m.group(0):
                newtext = newtext[:m.start()] + new_url + newtext[m.end():]
                cnt += 1
                changed = True

        waybackpat = (r"https?://web\.archive\.org/web/([0-9]+)/"
                      r"(https?://(?:mobile\.)?twitter\.com/\w+/status/\d+(?:\?[^\s}<|]+)?)")
        waybackcnt = 0
        code = mwparserfromhell.parse(newtext)
        for template in code.filter_templates():
            if not (template.name.matches("cite web")
                    or template.name.matches("cite tweet")):
                continue
            if not template.has("archive-url"):
                continue
            val = str(template.get('archive-url').value).strip()
            m = re.fullmatch(waybackpat, val)
            if m is None:
                continue
            timestamp = m.group(1)
            new_url = self._strip_tracking(m.group(2))
            if new_url == m.group(2):
                continue  # archive already points at a clean URL
            # Ask the Wayback Machine for a snapshot of the *cleaned* URL,
            # accepting anything within a year of the original timestamp.
            try:
                memento = self.wayback.get_memento(
                    new_url,
                    datetime=datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S'),
                    exact=False, target_window=365 * 24 * 3600)
            except wayback.exceptions.WaybackException:
                continue
            if memento is None or not memento.ok:
                continue
            archive_url = memento.memento_url
            soup = bs4.BeautifulSoup(memento.text, features="html.parser")
            if soup.title is None:
                continue
            print(soup.title.string)
            if soup.title.string == 'Twitter':
                # Snapshots of broken/error pages carry a bare 'Twitter'
                # title; skip them.
                continue
            # The client returns the raw-content ("id_") memento URL;
            # rewrite it to the regular web-UI form for the citation.
            api_pat = r"http://web\.archive\.org/web/([0-9]+)id_/(.+)"
            m = re.fullmatch(api_pat, archive_url)
            # BUG FIX: the original assumed this pattern always matched and
            # crashed with AttributeError otherwise (e.g. an https or
            # non-"id_" memento URL). Skip instead.
            if m is None:
                continue
            url = f'https://web.archive.org/web/{m.group(1)}/{m.group(2)}'
            template.add("archive-url", url)
            template.add("archive-date", memento.timestamp.strftime("%Y-%m-%d"))
            waybackcnt += 1
            changed = True

        if not changed:
            return
        msg = ("Removing Twitter tracker params "
               "([[Wikipedia:Bots/Requests_for_approval/ScannerBot|BRFA]])")
        parts = []
        if cnt > 0:
            parts.append(f"{cnt} link(s) fixed")
        if waybackcnt > 0:
            parts.append(f"{waybackcnt} archive link(s) changed")
        summary = f"{msg} ({', '.join(parts)})"
        # BUG FIX: page.text expects a str; the original assigned the
        # mwparserfromhell Wikicode object directly.
        self.current_page.text = str(code)
        self.current_page.save(summary=summary, minor=True, botflag=True)
def main() -> None:
    """Script entry point: run the scanner over all search results."""
    ScannerBot().run()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment