Skip to content

Instantly share code, notes, and snippets.

@halfak
Last active July 6, 2017 19:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save halfak/b31b8ddc38ca701c4c964478a53da75f to your computer and use it in GitHub Desktop.
Save halfak/b31b8ddc38ca701c4c964478a53da75f to your computer and use it in GitHub Desktop.
>>> import re
>>> import time
>>> import mwapi
>>> from ores.util import timeout
>>>
>>> session = mwapi.Session("https://es.wikipedia.org")
Sending requests with default User-Agent. Set 'user_agent' on mwapi.Session to quiet this message.
>>> doc = session.get(action='query', prop='revisions', revids=100032572, rvprop='content', formatversion=2)
>>> text = doc['query']['pages'][0]['revisions'][0]['content']
>>> print(text[0:100])
{MARICAAAAA CASAMELAAAA FUMA PORRO
JAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJA
>>>
>>> bad_re = re.compile(r"j+[eaiou]+(j+[aeiou]*)*", re.I + re.M)
>>>
>>> start = time.time()
>>> list(bad_re.finditer(text))
[<_sre.SRE_Match object; span=(35, 2147), match='JAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJA>, <_sre.SRE_Match object; span=(2152, 2553), match='JAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJA>, <_sre.SRE_Match object; span=(2557, 3668), match='JAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJ>, <_sre.SRE_Match object; span=(3677, 4190), match='JAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJA>, <_sre.SRE_Match object; span=(4708, 4710), match='ji'>, <_sre.SRE_Match object; span=(5349, 5351), match='ju'>, <_sre.SRE_Match object; span=(5830, 5832), match='ju'>, <_sre.SRE_Match object; span=(5874, 5876), match='ju'>, <_sre.SRE_Match object; span=(8518, 8520), match='je'>, <_sre.SRE_Match object; span=(8670, 8672), match='ju'>, <_sre.SRE_Match object; span=(8710, 8712), match='ju'>, <_sre.SRE_Match object; span=(9175, 9177), match='je'>, <_sre.SRE_Match object; span=(9331, 9333), match='jo'>, <_sre.SRE_Match object; span=(10653, 10655), match='jo'>, <_sre.SRE_Match object; span=(11561, 11563), match='ja'>, <_sre.SRE_Match object; span=(12057, 12059), match='jo'>, <_sre.SRE_Match object; span=(12891, 12893), match='ja'>, <_sre.SRE_Match object; span=(13190, 13192), match='ja'>, <_sre.SRE_Match object; span=(13351, 13353), match='je'>, <_sre.SRE_Match object; span=(13446, 13448), match='ju'>, <_sre.SRE_Match object; span=(14357, 14359), match='ju'>, <_sre.SRE_Match object; span=(14932, 14934), match='jo'>, <_sre.SRE_Match object; span=(15041, 15043), match='jo'>, <_sre.SRE_Match object; span=(16713, 16715), match='jo'>, <_sre.SRE_Match object; span=(16860, 16862), match='jo'>, <_sre.SRE_Match object; span=(16987, 16989), match='ja'>, <_sre.SRE_Match object; span=(17355, 17357), match='je'>, <_sre.SRE_Match object; span=(18567, 18570), match='Jia'>, <_sre.SRE_Match object; span=(18705, 18707), match='Ja'>, <_sre.SRE_Match object; span=(19251, 19253), match='ju'>, <_sre.SRE_Match object; span=(20373, 20375), match='ja'>]
>>> print("ja's matching:", time.time() - start)
ja's matching: 0.00445246696472168
>>>
>>> print("done")
done
$ python
Python 3.5.1+ (default, Mar 30 2016, 22:46:26)
[GCC 5.3.1 20160330] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import re
>>> import time
>>> import mwapi
>>> from ores.util import timeout
>>>
>>> session = mwapi.Session("https://es.wikipedia.org")
Sending requests with default User-Agent. Set 'user_agent' on mwapi.Session to quiet this message.
>>> doc = session.get(action='query', prop='revisions', revids=100032572, rvprop='content', formatversion=2)
>>> text = doc['query']['pages'][0]['revisions'][0]['content']
>>> print(text[0:100])
{MARICAAAAA CASAMELAAAA FUMA PORRO
JAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJA
>>>
>>> bad_re = re.compile('(\\b)(agan|agregenme|aguante|aki|amaona|amigui|amo+|apesta|asco|asi|atte?|(?:b+l+a+h*)+|bobada|bobos?|bubis|chafa|chale|chido|chí|chil[ei]ar|comi[ao]|copien|cursiva|esq|est[uú]pid[aeo]+[rs]?|fulan[ao]|fe[ao]s?|gra(x|sias)|guapo|h?o+la+|ho+lis?|ijos|inserta|j+[eaiou]+(j+[aeiou]*)*|ke|kie(n|ro)|komo|lean|lees|lo[ck]os?|(l+[uo]+l+)([uo]+l+)*|madrazo|malparida|mcfinnigan|mensos?|meti[oa]|metroflog|migu?is|muxo|negrit[ao]|nocheto|noo+|nop|ojala|o+l[ia]|osea|pollid|popo|pipi|plis|por[- ]?favor|por[- ]?[kq]ue?|porke|porqe?|porquer[ií]as?|profe|pupu|qiero|salud(o?s)?|sierto|shí|sii+|soi|sophonpanich|ta?mbn|tkm|tanga|te[- ]?quiero[- ]?mucho|tqm|umaxnet|vallanse|vayanse|wen[ao]|weon(es)?|wey|xd+|xfarm|yolo|zorpia)(\\b)', re.I)
>>>
>>> def apply_bad_re(text):
...     return list(bad_re.finditer(text))
...
>>> timeout(apply_bad_re, text, seconds=15)
^CTraceback (most recent call last):
  File "/home/halfak/venv/3.5/lib/python3.5/site-packages/ores-0.9.1-py3.5.egg/ores/util.py", line 26, in timeout
    result = func(*args, **kwargs)
  File "<stdin>", line 2, in apply_bad_re
KeyboardInterrupt
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "/home/halfak/venv/3.5/lib/python3.5/site-packages/ores-0.9.1-py3.5.egg/ores/util.py", line 26, in timeout
    result = func(*args, **kwargs)
stopit.utils.TimeoutException
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/home/halfak/venv/3.5/lib/python3.5/site-packages/ores-0.9.1-py3.5.egg/ores/util.py", line 41, in timeout
    raise TimeoutError("Timed out after {0} seconds.".format(e.seconds))
AttributeError: 'TimeoutException' object has no attribute 'seconds'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment