Last active
December 29, 2022 17:10
-
-
Save fee1-dead/8428cd954b55d83043f94a1753e91a18 to your computer and use it in GitHub Desktop.
I hereby release the following source code under the Apache License 2.0 or under the MIT license, at your option.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# MIT or Apache-2 | |
import bs4 | |
import regex as re | |
import wayback | |
import mwparserfromhell | |
import pywikibot | |
from pywikibot.bot import (ExistingPageBot, SingleSiteBot) | |
from pywikibot.pagegenerators import SearchPageGenerator | |
from urllib import parse | |
import datetime | |
class ScannerBot(ExistingPageBot, SingleSiteBot):
    """Bot that strips Twitter tracking query parameters from tweet links
    in mainspace wikitext, and repoints ``archive-url`` parameters of
    {{cite web}} / {{cite tweet}} templates at a Wayback Machine snapshot
    of the cleaned URL.

    Pages are found via an ``insource:`` regex search for tweet URLs that
    carry one of the known tracking parameters.
    """

    # Query-string keys Twitter appends purely for click tracking.
    BAD_PARAMS = ('cxt', 'ref_src', 'ref_url', 's', 't')

    update_options = {
        'clean': False,
        'create': False,
        'merge': False,
    }

    def __init__(self) -> None:
        """Connect to Wikipedia and build the search generator."""
        site = pywikibot.Site(fam='wikipedia')
        self.wayback = wayback.WaybackClient()
        super().__init__(
            site=site,
            generator=SearchPageGenerator(
                r'insource:/twitter\.com\/[a-zA-Z0-9]+\/status\/[0-9]+\/?\?'
                r'([st]|cxt|ref_[a-z]+)=/',
                site=site))

    @staticmethod
    def _strip_tracking(url_str: str) -> str:
        """Return *url_str* with the known tracking query parameters removed.

        Non-tracking parameters (and their order, as re-encoded by
        ``urlencode``) are preserved.
        """
        parts = parse.urlparse(url_str)
        qs = parse.parse_qs(parts.query)
        for param in ScannerBot.BAD_PARAMS:
            qs.pop(param, None)
        return parse.urlunparse(parts._replace(query=parse.urlencode(qs, doseq=True)))

    def treat_page(self) -> None:
        """Clean tweet URLs on the current page and save it if anything changed."""
        if self.current_page.namespace() != 0:
            return  # mainspace articles only

        newtext = str(self.current_page.text)
        changed = False
        cnt = 0
        # Negative lookbehind skips URLs that are themselves inside an
        # archive/cache wrapper; it is variable-width, which is why the
        # third-party `regex` module is used instead of stdlib `re`.
        pattern = (r"(?<!\?url=|/|cache:)https?://(?:mobile\.)?"
                   r"twitter\.com/\w+/status/\d+\?[^\s}<|]+")
        # Iterate matches right-to-left so earlier match offsets remain
        # valid while replacements are spliced into the string.
        for m in reversed(list(re.finditer(pattern, newtext))):
            new_url = self._strip_tracking(m.group(0))
            # BUG FIX: the original compared new_url against m.string (the
            # whole page text), so *every* match counted as a change.
            # Compare against the matched URL itself.
            if new_url != m.group(0):
                newtext = newtext[:m.start()] + new_url + newtext[m.end():]
                cnt += 1
                changed = True

        waybackpat = (r"https?://web\.archive\.org/web/([0-9]+)/"
                      r"(https?://(?:mobile\.)?twitter\.com/\w+/status/\d+(?:\?[^\s}<|]+)?)")
        waybackcnt = 0
        code = mwparserfromhell.parse(newtext)
        for template in code.filter_templates():
            if not (template.name.matches("cite web")
                    or template.name.matches("cite tweet")):
                continue
            if not template.has("archive-url"):
                continue
            val = str(template.get('archive-url').value).strip()
            m = re.fullmatch(waybackpat, val)
            if m is None:
                continue
            timestamp = m.group(1)
            new_url = self._strip_tracking(m.group(2))
            if new_url == m.group(2):
                continue  # archive already points at a clean URL
            # Ask the Wayback Machine for a snapshot of the *cleaned* URL,
            # accepting anything within a year of the original timestamp.
            try:
                memento = self.wayback.get_memento(
                    new_url,
                    datetime=datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S'),
                    exact=False, target_window=365 * 24 * 3600)
            except wayback.exceptions.WaybackException:
                continue
            if memento is None or not memento.ok:
                continue
            archive_url = memento.memento_url
            soup = bs4.BeautifulSoup(memento.text, features="html.parser")
            if soup.title is None:
                continue
            print(soup.title.string)
            if soup.title.string == 'Twitter':
                # Snapshots of broken/error pages carry a bare 'Twitter'
                # title; skip them.
                continue
            # The client returns the raw-content ("id_") memento URL;
            # rewrite it to the regular web-UI form for the citation.
            api_pat = r"http://web\.archive\.org/web/([0-9]+)id_/(.+)"
            m = re.fullmatch(api_pat, archive_url)
            # BUG FIX: the original assumed this pattern always matched and
            # crashed with AttributeError otherwise (e.g. an https or
            # non-"id_" memento URL). Skip instead.
            if m is None:
                continue
            url = f'https://web.archive.org/web/{m.group(1)}/{m.group(2)}'
            template.add("archive-url", url)
            template.add("archive-date", memento.timestamp.strftime("%Y-%m-%d"))
            waybackcnt += 1
            changed = True

        if not changed:
            return
        msg = ("Removing Twitter tracker params "
               "([[Wikipedia:Bots/Requests_for_approval/ScannerBot|BRFA]])")
        parts = []
        if cnt > 0:
            parts.append(f"{cnt} link(s) fixed")
        if waybackcnt > 0:
            parts.append(f"{waybackcnt} archive link(s) changed")
        summary = f"{msg} ({', '.join(parts)})"
        # BUG FIX: page.text expects a str; the original assigned the
        # mwparserfromhell Wikicode object directly.
        self.current_page.text = str(code)
        self.current_page.save(summary=summary, minor=True, botflag=True)
def main() -> None:
    """Script entry point: run the scanner over all search results."""
    ScannerBot().run()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment