Skip to content

Instantly share code, notes, and snippets.

@fee1-dead
Last active December 29, 2022 17:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fee1-dead/8428cd954b55d83043f94a1753e91a18 to your computer and use it in GitHub Desktop.
I hereby release the following source code under the Apache License 2.0 or under the MIT license, at your option.
# MIT or Apache-2
import bs4
import regex as re
import wayback
import mwparserfromhell
import pywikibot
from pywikibot.bot import (ExistingPageBot, SingleSiteBot)
from pywikibot.pagegenerators import SearchPageGenerator
from urllib import parse
import datetime
class ScannerBot(ExistingPageBot, SingleSiteBot):
    """Strip Twitter tracking query parameters from tweet links in articles.

    Finds ``twitter.com/<user>/status/<id>?...`` links carrying tracker
    parameters (``s``, ``t``, ``cxt``, ``ref_src``, ``ref_url``), removes
    those parameters, and retargets any ``cite web``/``cite tweet``
    ``archive-url`` that points at the old URL to a Wayback Machine
    snapshot of the cleaned URL.
    """

    update_options = {
        'clean': False,
        'create': False,
        'merge': False,
    }

    def __init__(self) -> None:
        site = pywikibot.Site(fam='wikipedia')
        self.wayback = wayback.WaybackClient()
        super().__init__(
            site=site,
            generator=SearchPageGenerator(
                r'insource:/twitter\.com\/[a-zA-Z0-9]+\/status\/[0-9]+\/?\?([st]|cxt|ref_[a-z]+)=/',
                site=site))

    def treat_page(self) -> None:
        """Clean tracker params on the current page and save if anything changed."""
        if self.current_page.namespace() != 0:  # mainspace articles only
            return
        bad_params = ['cxt', 'ref_src', 'ref_url', 's', 't']
        # Lookbehind skips URLs that are embedded inside another URL
        # ("?url=...", ".../https://twitter...", "cache:...").
        pattern = r"(?<!\?url=|/|cache:)https?://(?:mobile\.)?twitter\.com/\w+/status/\d+\?[^\s}<|]+"
        newtext = str(self.current_page.text)
        matches = list(re.finditer(pattern, newtext))
        changed = False
        cnt = 0
        # Iterate in reverse so earlier match offsets stay valid after edits.
        for m in reversed(matches):
            m: re.Match = m
            # Delete the tracking parameters from the query string.
            url = parse.urlparse(newtext[m.start():m.end()])
            qs = parse.parse_qs(url.query)
            # NOTE(review): parse_qs drops blank-valued params by default —
            # confirm no legitimate blank params need preserving here.
            for param in bad_params:
                qs.pop(param, None)
            url = url._replace(query=parse.urlencode(qs, doseq=True))
            new_url = parse.urlunparse(url)
            # BUG FIX: was `new_url != m.string` — Match.string is the whole
            # page text, so the comparison was true for every match and the
            # page was rewritten (and counted) even when nothing changed.
            if new_url != m.group(0):
                newtext = new_url.join([newtext[:m.start()], newtext[m.end():]])
                cnt += 1
                changed = True
        waybackpat = r"https?://web\.archive\.org/web/([0-9]+)/(https?://(?:mobile\.)?twitter\.com/\w+/status/\d+(?:\?[^\s}<|]+)?)"
        waybackcnt = 0
        code = mwparserfromhell.parse(newtext)
        for template in code.filter_templates():
            template: mwparserfromhell.nodes.Template = template
            if not template.name.matches("cite web") and not template.name.matches("cite tweet"):
                continue
            if not template.has("archive-url"):
                continue
            p: mwparserfromhell.nodes.template.Parameter = template.get('archive-url')
            val = str(p.value).strip()
            m = re.fullmatch(waybackpat, val)
            if m is None:
                continue
            timestamp = m.group(1)
            # Strip the same tracking params from the archived target URL.
            url = parse.urlparse(m.group(2))
            qs = parse.parse_qs(url.query)
            for param in bad_params:
                qs.pop(param, None)
            url = url._replace(query=parse.urlencode(qs, doseq=True))
            new_url = parse.urlunparse(url)
            if new_url == m.group(2):
                continue
            # Get an archive of the cleaned URL within a target window of 1 year.
            try:
                memento = self.wayback.get_memento(
                    new_url,
                    datetime=datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S'),
                    exact=False, target_window=365 * 24 * 3600)
            except wayback.exceptions.WaybackException:
                continue
            if memento is None or not memento.ok:
                continue
            archive_url = memento.memento_url
            timestamp = memento.timestamp
            soup = bs4.BeautifulSoup(memento.text, features="html.parser")
            if soup.title is None:
                continue
            print(soup.title.string)
            if soup.title.string == 'Twitter':  # placeholder snapshots have no real title — skip
                continue
            # Mementos may come back in the raw "id_" API form; rewrite those
            # to the normal web-UI form.
            api_pat = r"http://web\.archive\.org/web/([0-9]+)id_/(.+)"
            m = re.fullmatch(api_pat, archive_url)
            # BUG FIX: original dereferenced m.group() without a None check,
            # raising AttributeError for https or non-"id_" memento URLs.
            if m is not None:
                archive_url = f'https://web.archive.org/web/{m.group(1)}/{m.group(2)}'
            template.add("archive-url", archive_url)
            template.add("archive-date", timestamp.strftime("%Y-%m-%d"))
            waybackcnt += 1
            changed = True
        # BUG FIX: was `newtext = code` — that assigns the Wikicode object;
        # serialize back to a plain string before saving the page text.
        newtext = str(code)
        if changed:
            msg = "Removing Twitter tracker params ([[Wikipedia:Bots/Requests_for_approval/ScannerBot|BRFA]])"
            cmt = ""
            if cnt > 0:
                cmt += f"{cnt} link(s) fixed"
            if waybackcnt > 0:
                if len(cmt) > 0:
                    cmt += ", "
                cmt += f"{waybackcnt} archive link(s) changed"
            summary = f"{msg} ({cmt})"
            self.current_page.text = newtext
            self.current_page.save(summary=summary, minor=True, botflag=True)
if __name__ == '__main__':
    # Run the bot over every page yielded by the insource search generator.
    ScannerBot().run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment