Skip to content

Instantly share code, notes, and snippets.

@halfak
Last active December 20, 2016 07:31
Show Gist options
  • Save halfak/a19f4d23df26151be2a57a42a6451caf to your computer and use it in GitHub Desktop.
Save halfak/a19f4d23df26151be2a57a42a6451caf to your computer and use it in GitHub Desktop.
from revscoring.features import wikitext
# Character-level features of the revision's wikitext: the total character
# count, per-category character counts, and the longest repeated-character
# run.  All are taken directly from ``wikitext.revision``.
char_based = [
    wikitext.revision.chars,
    wikitext.revision.whitespace_chars,
    wikitext.revision.markup_chars,
    wikitext.revision.cjk_chars,
    wikitext.revision.entity_chars,
    wikitext.revision.url_chars,
    wikitext.revision.word_chars,
    wikitext.revision.uppercase_word_chars,
    wikitext.revision.punctuation_chars,
    wikitext.revision.break_chars,
    wikitext.revision.longest_repeated_char,
]
# Token-level features of the revision's wikitext.  The first eleven are
# token counts (the total, then one per token category); the last two are
# the lengths of the longest token and the longest word.
token_based = [
    wikitext.revision.tokens,
    wikitext.revision.numbers,
    wikitext.revision.whitespaces,
    wikitext.revision.markups,
    wikitext.revision.cjks,
    wikitext.revision.entities,
    wikitext.revision.urls,
    wikitext.revision.words,
    wikitext.revision.uppercase_words,
    wikitext.revision.punctuations,
    wikitext.revision.breaks,
    wikitext.revision.longest_token,
    wikitext.revision.longest_word,
]
# Structural features of the revision's wikitext (content characters,
# headings, links, tags, references and templates).
# NOTE(review): presumably these come from parsing the wikitext rather than
# from tokenizing it, as the list name suggests -- confirm against revscoring.
parse_based = [
    wikitext.revision.content_chars,
    wikitext.revision.headings,
    wikitext.revision.external_links,
    wikitext.revision.wikilinks,
    wikitext.revision.tags,
    wikitext.revision.ref_tags,
    wikitext.revision.templates,
]
# Each character-level feature divided by the total character count, so the
# value is a proportion of the revision's characters rather than a raw count.
char_normalized = [
    numerator / wikitext.revision.chars
    for numerator in (
        wikitext.revision.whitespace_chars,
        wikitext.revision.markup_chars,
        wikitext.revision.cjk_chars,
        wikitext.revision.entity_chars,
        wikitext.revision.url_chars,
        wikitext.revision.word_chars,
        wikitext.revision.uppercase_word_chars,
        wikitext.revision.punctuation_chars,
        wikitext.revision.break_chars,
        wikitext.revision.longest_repeated_char,
    )
]
# Each token-level feature divided by the total token count, so the value is
# relative to the number of tokens in the revision rather than a raw count.
token_normalized = [
    numerator / wikitext.revision.tokens
    for numerator in (
        wikitext.revision.numbers,
        wikitext.revision.whitespaces,
        wikitext.revision.markups,
        wikitext.revision.cjks,
        wikitext.revision.entities,
        wikitext.revision.urls,
        wikitext.revision.words,
        wikitext.revision.uppercase_words,
        wikitext.revision.punctuations,
        wikitext.revision.breaks,
        wikitext.revision.longest_token,
        wikitext.revision.longest_word,
    )
]
# The complete draft-quality feature list: raw character, token and parse
# features first, followed by the character- and token-normalized ratios.
# The order of the sub-lists (and of the features inside them) is preserved.
draft_quality = [
    *char_based,
    *token_based,
    *parse_based,
    *char_normalized,
    *token_normalized,
]
>>> from pprint import pprint
>>>
>>> from revscoring.datasources import revision_oriented as ro
>>> from revscoring.dependencies import solve
>>>
>>>
>>> text = """
... {{Infobox|foo}}
... I am an article
...
... == Header! ==
... * list
... * items
... * [[stuff|and a link]]<ref>some stuff</ref>
... """
>>> pprint(list(zip(draft_quality, solve(draft_quality, cache={ro.revision.text: text}))))
[(<feature.wikitext.revision.chars>, 107),
(<feature.wikitext.revision.whitespace_chars>, 17),
(<feature.wikitext.revision.markup_chars>, 12),
(<feature.wikitext.revision.cjk_chars>, 0),
(<feature.wikitext.revision.entity_chars>, 0),
(<feature.wikitext.revision.url_chars>, 0),
(<feature.wikitext.revision.word_chars>, 59),
(<feature.wikitext.revision.uppercase_word_chars>, 0),
(<feature.wikitext.revision.punctuation_chars>, 1),
(<feature.wikitext.revision.break_chars>, 2),
(<feature.wikitext.revision.longest_repeated_char>, 2),
(<feature.len(<datasource.tokenized(datasource.revision.text)>)>, 47),
(<feature.len(<datasource.wikitext.revision.numbers>)>, 0),
(<feature.len(<datasource.wikitext.revision.whitespaces>)>, 17),
(<feature.len(<datasource.wikitext.revision.markups>)>, 6),
(<feature.len(<datasource.wikitext.revision.cjks>)>, 0),
(<feature.len(<datasource.wikitext.revision.entities>)>, 0),
(<feature.len(<datasource.wikitext.revision.urls>)>, 0),
(<feature.len(<datasource.wikitext.revision.words>)>, 15),
(<feature.len(<datasource.wikitext.revision.uppercase_words>)>, 0),
(<feature.len(<datasource.wikitext.revision.punctuations>)>, 1),
(<feature.len(<datasource.wikitext.revision.breaks>)>, 1),
(<feature.max(<datasource.map(<built-in function len>, <datasource.tokenized(datasource.revision.text)>)>)>, 7),
(<feature.max(<datasource.map(<built-in function len>, <datasource.wikitext.revision.words>)>)>, 7),
(<feature.wikitext.revision.content_chars>, 61),
(<feature.wikitext.revision.headings>, 1),
(<feature.wikitext.revision.external_links>, 0),
(<feature.wikitext.revision.wikilinks>, 1),
(<feature.wikitext.revision.tags>, 4),
(<feature.wikitext.revision.ref_tags>, 1),
(<feature.wikitext.revision.templates>, 1),
(<feature.(wikitext.revision.whitespace_chars / wikitext.revision.chars)>, 0.1588785046728972),
(<feature.(wikitext.revision.markup_chars / wikitext.revision.chars)>, 0.11214953271028037),
(<feature.(wikitext.revision.cjk_chars / wikitext.revision.chars)>, 0.0),
(<feature.(wikitext.revision.entity_chars / wikitext.revision.chars)>, 0.0),
(<feature.(wikitext.revision.url_chars / wikitext.revision.chars)>, 0.0),
(<feature.(wikitext.revision.word_chars / wikitext.revision.chars)>, 0.5514018691588785),
(<feature.(wikitext.revision.uppercase_word_chars / wikitext.revision.chars)>, 0.0),
(<feature.(wikitext.revision.punctuation_chars / wikitext.revision.chars)>, 0.009345794392523364),
(<feature.(wikitext.revision.break_chars / wikitext.revision.chars)>, 0.018691588785046728),
(<feature.(wikitext.revision.longest_repeated_char / wikitext.revision.chars)>, 0.018691588785046728),
(<feature.(len(<datasource.wikitext.revision.numbers>) / len(<datasource.tokenized(datasource.revision.text)>))>, 0.0),
(<feature.(len(<datasource.wikitext.revision.whitespaces>) / len(<datasource.tokenized(datasource.revision.text)>))>, 0.3617021276595745),
(<feature.(len(<datasource.wikitext.revision.markups>) / len(<datasource.tokenized(datasource.revision.text)>))>, 0.1276595744680851),
(<feature.(len(<datasource.wikitext.revision.cjks>) / len(<datasource.tokenized(datasource.revision.text)>))>, 0.0),
(<feature.(len(<datasource.wikitext.revision.entities>) / len(<datasource.tokenized(datasource.revision.text)>))>, 0.0),
(<feature.(len(<datasource.wikitext.revision.urls>) / len(<datasource.tokenized(datasource.revision.text)>))>, 0.0),
(<feature.(len(<datasource.wikitext.revision.words>) / len(<datasource.tokenized(datasource.revision.text)>))>, 0.3191489361702128),
(<feature.(len(<datasource.wikitext.revision.uppercase_words>) / len(<datasource.tokenized(datasource.revision.text)>))>, 0.0),
(<feature.(len(<datasource.wikitext.revision.punctuations>) / len(<datasource.tokenized(datasource.revision.text)>))>, 0.02127659574468085),
(<feature.(len(<datasource.wikitext.revision.breaks>) / len(<datasource.tokenized(datasource.revision.text)>))>, 0.02127659574468085),
(<feature.(max(<datasource.map(<built-in function len>, <datasource.tokenized(datasource.revision.text)>)>) / len(<datasource.tokenized(datasource.revision.text)>))>, 0.14893617021276595),
(<feature.(max(<datasource.map(<built-in function len>, <datasource.wikitext.revision.words>)>) / len(<datasource.tokenized(datasource.revision.text)>))>, 0.14893617021276595)]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment