Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
>>> import re
>>> import time
>>> import mwapi
>>> from ores.util import timeout
>>>
>>> session = mwapi.Session("https://es.wikipedia.org")
Sending requests with default User-Agent. Set 'user_agent' on mwapi.Session to quiet this message.
>>> doc = session.get(action='query', prop='revisions', revids=100032572, rvprop='content', formatversion=2)
>>> text = doc['query']['pages'][0]['revisions'][0]['content']
>>> print(text[0:100])
{MARICAAAAA CASAMELAAAA FUMA PORRO
JAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJA
>>>
>>> bad_re = re.compile(r"j+[eaiou]+(j+[aeiou]*)*", re.I + re.M)
>>>
>>> start = time.time()
>>> list(bad_re.finditer(text))
[<_sre.SRE_Match object; span=(35, 2147), match='JAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJA>, <_sre.SRE_Match object; span=(2152, 2553), match='JAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJA>, <_sre.SRE_Match object; span=(2557, 3668), match='JAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJ>, <_sre.SRE_Match object; span=(3677, 4190), match='JAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJA>, <_sre.SRE_Match object; span=(4708, 4710), match='ji'>, <_sre.SRE_Match object; span=(5349, 5351), match='ju'>, <_sre.SRE_Match object; span=(5830, 5832), match='ju'>, <_sre.SRE_Match object; span=(5874, 5876), match='ju'>, <_sre.SRE_Match object; span=(8518, 8520), match='je'>, <_sre.SRE_Match object; span=(8670, 8672), match='ju'>, <_sre.SRE_Match object; span=(8710, 8712), match='ju'>, <_sre.SRE_Match object; span=(9175, 9177), match='je'>, <_sre.SRE_Match object; span=(9331, 9333), match='jo'>, <_sre.SRE_Match object; span=(10653, 10655), match='jo'>, <_sre.SRE_Match object; span=(11561, 11563), match='ja'>, <_sre.SRE_Match object; span=(12057, 12059), match='jo'>, <_sre.SRE_Match object; span=(12891, 12893), match='ja'>, <_sre.SRE_Match object; span=(13190, 13192), match='ja'>, <_sre.SRE_Match object; span=(13351, 13353), match='je'>, <_sre.SRE_Match object; span=(13446, 13448), match='ju'>, <_sre.SRE_Match object; span=(14357, 14359), match='ju'>, <_sre.SRE_Match object; span=(14932, 14934), match='jo'>, <_sre.SRE_Match object; span=(15041, 15043), match='jo'>, <_sre.SRE_Match object; span=(16713, 16715), match='jo'>, <_sre.SRE_Match object; span=(16860, 16862), match='jo'>, <_sre.SRE_Match object; span=(16987, 16989), match='ja'>, <_sre.SRE_Match object; span=(17355, 17357), match='je'>, <_sre.SRE_Match object; span=(18567, 18570), match='Jia'>, <_sre.SRE_Match object; span=(18705, 18707), match='Ja'>, <_sre.SRE_Match object; span=(19251, 19253), match='ju'>, <_sre.SRE_Match object; span=(20373, 20375), match='ja'>]
>>> print("ja's matching:", time.time() - start)
ja's matching: 0.00445246696472168
>>>
>>> print("done")
done
$ python
Python 3.5.1+ (default, Mar 30 2016, 22:46:26)
[GCC 5.3.1 20160330] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import re
>>> import time
>>> import mwapi
>>> from ores.util import timeout
>>>
>>> session = mwapi.Session("https://es.wikipedia.org")
Sending requests with default User-Agent. Set 'user_agent' on mwapi.Session to quiet this message.
>>> doc = session.get(action='query', prop='revisions', revids=100032572, rvprop='content', formatversion=2)
>>> text = doc['query']['pages'][0]['revisions'][0]['content']
>>> print(text[0:100])
{MARICAAAAA CASAMELAAAA FUMA PORRO
JAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJA
>>>
>>> bad_re = re.compile('(\\b)(agan|agregenme|aguante|aki|amaona|amigui|amo+|apesta|asco|asi|atte?|(?:b+l+a+h*)+|bobada|bobos?|bubis|chafa|chale|chido|chí|chil[ei]ar|comi[ao]|copien|cursiva|esq|est[uú]pid[aeo]+[rs]?|fulan[ao]|fe[ao]s?|gra(x|sias)|guapo|h?o+la+|ho+lis?|ijos|inserta|j+[eaiou]+(j+[aeiou]*)*|ke|kie(n|ro)|komo|lean|lees|lo[ck]os?|(l+[uo]+l+)([uo]+l+)*|madrazo|malparida|mcfinnigan|mensos?|meti[oa]|metroflog|migu?is|muxo|negrit[ao]|nocheto|noo+|nop|ojala|o+l[ia]|osea|pollid|popo|pipi|plis|por[- ]?favor|por[- ]?[kq]ue?|porke|porqe?|porquer[ií]as?|profe|pupu|qiero|salud(o?s)?|sierto|shí|sii+|soi|sophonpanich|ta?mbn|tkm|tanga|te[- ]?quiero[- ]?mucho|tqm|umaxnet|vallanse|vayanse|wen[ao]|weon(es)?|wey|xd+|xfarm|yolo|zorpia)(\\b)', re.I)
>>>
>>> def apply_bad_re(text):
...     return list(bad_re.finditer(text))
...
>>> timeout(apply_bad_re, text, seconds=15)
^CTraceback (most recent call last):
  File "/home/halfak/venv/3.5/lib/python3.5/site-packages/ores-0.9.1-py3.5.egg/ores/util.py", line 26, in timeout
    result = func(*args, **kwargs)
  File "<stdin>", line 2, in apply_bad_re
KeyboardInterrupt
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "/home/halfak/venv/3.5/lib/python3.5/site-packages/ores-0.9.1-py3.5.egg/ores/util.py", line 26, in timeout
    result = func(*args, **kwargs)
stopit.utils.TimeoutException
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/home/halfak/venv/3.5/lib/python3.5/site-packages/ores-0.9.1-py3.5.egg/ores/util.py", line 41, in timeout
    raise TimeoutError("Timed out after {0} seconds.".format(e.seconds))
AttributeError: 'TimeoutException' object has no attribute 'seconds'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.
You can’t perform that action at this time.