Last active
July 6, 2017 19:30
-
-
Save halfak/b31b8ddc38ca701c4c964478a53da75f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> import re | |
>>> import time | |
>>> import mwapi | |
>>> from ores.util import timeout | |
>>> | |
>>> session = mwapi.Session("https://es.wikipedia.org") | |
Sending requests with default User-Agent. Set 'user_agent' on mwapi.Session to quiet this message. | |
>>> doc = session.get(action='query', prop='revisions', revids=100032572, rvprop='content', formatversion=2) | |
>>> text = doc['query']['pages'][0]['revisions'][0]['content'] | |
>>> print(text[0:100]) | |
{MARICAAAAA CASAMELAAAA FUMA PORRO | |
JAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJA | |
>>> | |
>>> bad_re = re.compile(r"j+[eaiou]+(j+[aeiou]*)*", re.I + re.M) | |
>>> | |
>>> start = time.time() | |
>>> list(bad_re.finditer(text)) | |
[<_sre.SRE_Match object; span=(35, 2147), match='JAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJA>, <_sre.SRE_Match object; span=(2152, 2553), match='JAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJA>, <_sre.SRE_Match object; span=(2557, 3668), match='JAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJ>, <_sre.SRE_Match object; span=(3677, 4190), match='JAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJA>, <_sre.SRE_Match object; span=(4708, 4710), match='ji'>, <_sre.SRE_Match object; span=(5349, 5351), match='ju'>, <_sre.SRE_Match object; span=(5830, 5832), match='ju'>, <_sre.SRE_Match object; span=(5874, 5876), match='ju'>, <_sre.SRE_Match object; span=(8518, 8520), match='je'>, <_sre.SRE_Match object; span=(8670, 8672), match='ju'>, <_sre.SRE_Match object; span=(8710, 8712), match='ju'>, <_sre.SRE_Match object; span=(9175, 9177), match='je'>, <_sre.SRE_Match object; span=(9331, 9333), match='jo'>, <_sre.SRE_Match object; span=(10653, 10655), match='jo'>, <_sre.SRE_Match object; span=(11561, 11563), match='ja'>, <_sre.SRE_Match object; span=(12057, 12059), match='jo'>, <_sre.SRE_Match object; span=(12891, 12893), match='ja'>, <_sre.SRE_Match object; span=(13190, 13192), match='ja'>, <_sre.SRE_Match object; span=(13351, 13353), match='je'>, <_sre.SRE_Match object; span=(13446, 13448), match='ju'>, <_sre.SRE_Match object; span=(14357, 14359), match='ju'>, <_sre.SRE_Match object; span=(14932, 14934), match='jo'>, <_sre.SRE_Match object; span=(15041, 15043), match='jo'>, <_sre.SRE_Match object; span=(16713, 16715), match='jo'>, <_sre.SRE_Match object; span=(16860, 16862), match='jo'>, <_sre.SRE_Match object; span=(16987, 16989), match='ja'>, <_sre.SRE_Match object; span=(17355, 17357), match='je'>, <_sre.SRE_Match object; span=(18567, 18570), match='Jia'>, <_sre.SRE_Match object; span=(18705, 18707), match='Ja'>, <_sre.SRE_Match object; span=(19251, 19253), match='ju'>, <_sre.SRE_Match object; span=(20373, 20375), match='ja'>] | |
>>> print("ja's matching:", time.time() - start) | |
ja's matching: 0.00445246696472168 | |
>>> | |
>>> print("done") | |
done |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ python | |
Python 3.5.1+ (default, Mar 30 2016, 22:46:26) | |
[GCC 5.3.1 20160330] on linux | |
Type "help", "copyright", "credits" or "license" for more information. | |
>>> import re | |
>>> import time | |
>>> import mwapi | |
>>> from ores.util import timeout | |
>>> | |
>>> session = mwapi.Session("https://es.wikipedia.org") | |
Sending requests with default User-Agent. Set 'user_agent' on mwapi.Session to quiet this message. | |
>>> doc = session.get(action='query', prop='revisions', revids=100032572, rvprop='content', formatversion=2) | |
>>> text = doc['query']['pages'][0]['revisions'][0]['content'] | |
>>> print(text[0:100]) | |
{MARICAAAAA CASAMELAAAA FUMA PORRO | |
JAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJAJAJAJAJJAJAJJAJAJA | |
>>> | |
>>> bad_re = re.compile('(\\b)(agan|agregenme|aguante|aki|amaona|amigui|amo+|apesta|asco|asi|atte?|(?:b+l+a+h*)+|bobada|bobos?|bubis|chafa|chale|chido|chí|chil[ei]ar|comi[ao]|copien|cursiva|esq|est[uú]pid[aeo]+[rs]?|fulan[ao]|fe[ao]s?|gra(x|sias)|guapo|h?o+la+|ho+lis?|ijos|inserta|j+[eaiou]+(j+[aeiou]*)*|ke|kie(n|ro)|komo|lean|lees|lo[ck]os?|(l+[uo]+l+)([uo]+l+)*|madrazo|malparida|mcfinnigan|mensos?|meti[oa]|metroflog|migu?is|muxo|negrit[ao]|nocheto|noo+|nop|ojala|o+l[ia]|osea|pollid|popo|pipi|plis|por[- ]?favor|por[- ]?[kq]ue?|porke|porqe?|porquer[ií]as?|profe|pupu|qiero|salud(o?s)?|sierto|shí|sii+|soi|sophonpanich|ta?mbn|tkm|tanga|te[- ]?quiero[- ]?mucho|tqm|umaxnet|vallanse|vayanse|wen[ao]|weon(es)?|wey|xd+|xfarm|yolo|zorpia)(\\b)', re.I) | |
>>> | |
>>> def apply_bad_re(text): | |
...     return list(bad_re.finditer(text)) | |
... | |
>>> timeout(apply_bad_re, text, seconds=15) | |
^CTraceback (most recent call last): | |
  File "/home/halfak/venv/3.5/lib/python3.5/site-packages/ores-0.9.1-py3.5.egg/ores/util.py", line 26, in timeout | |
    result = func(*args, **kwargs) | |
  File "<stdin>", line 2, in apply_bad_re | |
KeyboardInterrupt | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
  File "/home/halfak/venv/3.5/lib/python3.5/site-packages/ores-0.9.1-py3.5.egg/ores/util.py", line 26, in timeout | |
    result = func(*args, **kwargs) | |
stopit.utils.TimeoutException | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
  File "<stdin>", line 1, in <module> | |
  File "/home/halfak/venv/3.5/lib/python3.5/site-packages/ores-0.9.1-py3.5.egg/ores/util.py", line 41, in timeout | |
    raise TimeoutError("Timed out after {0} seconds.".format(e.seconds)) | |
AttributeError: 'TimeoutException' object has no attribute 'seconds' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment