Skip to content

Instantly share code, notes, and snippets.

@yfe404
Forked from halfak/features.py
Last active December 23, 2016 09:43
Show Gist options
  • Save yfe404/4378f5c0e5be902d93be522ddc247e6b to your computer and use it in GitHub Desktop.
Save yfe404/4378f5c0e5be902d93be522ddc247e6b to your computer and use it in GitHub Desktop.
from revscoring.features import wikitext
from revscoring.languages import english
# Character-level features of the revision's wikitext.
# `chars` is listed first on purpose: it is the denominator used for every
# ratio in `char_normalized`.
char_based = [
    wikitext.revision.chars,
    wikitext.revision.whitespace_chars,
    wikitext.revision.markup_chars,
    wikitext.revision.cjk_chars,
    wikitext.revision.entity_chars,
    wikitext.revision.url_chars,
    wikitext.revision.word_chars,
    wikitext.revision.uppercase_word_chars,
    wikitext.revision.punctuation_chars,
    wikitext.revision.break_chars,
    wikitext.revision.longest_repeated_char,
]
# Token-level features of the revision's wikitext.
# `tokens` is listed first on purpose: it is the denominator used for every
# ratio in `token_normalized`.
token_based = [
    wikitext.revision.tokens,
    wikitext.revision.numbers,
    wikitext.revision.whitespaces,
    wikitext.revision.markups,
    wikitext.revision.cjks,
    wikitext.revision.entities,
    wikitext.revision.urls,
    wikitext.revision.words,
    wikitext.revision.uppercase_words,
    wikitext.revision.punctuations,
    wikitext.revision.breaks,
    wikitext.revision.longest_token,
    wikitext.revision.longest_word,
]
# English-specific features: bad-word matches, stopword / non-stopword
# counts and informal-word matches. `lang_normalized` divides each of these
# by `wikitext.revision.words`.
lang_based = [
    english.badwords.revision.matches,
    english.stopwords.revision.stopwords,
    english.stopwords.revision.non_stopwords,
    english.informals.revision.matches,
]
# Structural features of the revision (content characters, headings, links,
# tags, references, templates). These are used as raw values only; this
# script defines no normalized counterpart for them.
parse_based = [
    wikitext.revision.content_chars,
    wikitext.revision.headings,
    wikitext.revision.external_links,
    wikitext.revision.wikilinks,
    wikitext.revision.tags,
    wikitext.revision.ref_tags,
    wikitext.revision.templates,
]
# Every character feature except `chars` itself (the first entry of
# `char_based`), expressed as a ratio of the total character count.
# NOTE(review): the denominator is presumably 0 for empty text -- confirm
# how the feature division behaves in that case.
char_normalized = [
    char_feature / wikitext.revision.chars
    for char_feature in char_based[1:]
]
# Every token feature except `tokens` itself (the first entry of
# `token_based`), expressed as a ratio of the total token count.
# NOTE(review): the denominator is presumably 0 for empty text -- confirm
# how the feature division behaves in that case.
token_normalized = [
    token_feature / wikitext.revision.tokens
    for token_feature in token_based[1:]
]
# Each English-specific feature expressed as a ratio of the word count.
# NOTE(review): the denominator is presumably 0 when the text has no
# words -- confirm how the feature division behaves in that case.
lang_normalized = [
    lang_feature / wikitext.revision.words
    for lang_feature in lang_based
]
# The complete feature list: raw groups first, then the ratio groups.
# The order is significant to anything that pairs this list positionally
# with solved values; note that `parse_based` precedes `lang_based`.
draft_quality = (
    char_based
    + token_based
    + parse_based
    + lang_based
    + char_normalized
    + token_normalized
    + lang_normalized
)
>>> from pprint import pprint
>>>
>>> from revscoring.datasources import revision_oriented as ro
>>> from revscoring.dependencies import solve
>>>
>>>
>>> text = """
... {{Infobox|foo}}
... I am an article
...
... == Header! ==
... * list
... * items
... * [[stuff|and a link]]<ref>some stuff</ref>
... """
>>> pprint(list(zip(draft_quality, solve(draft_quality, cache={ro.revision.text: text}))))
[(<feature.wikitext.revision.chars>, 139),
(<feature.wikitext.revision.whitespace_chars>, 27),
(<feature.wikitext.revision.markup_chars>, 12),
(<feature.wikitext.revision.cjk_chars>, 0),
(<feature.wikitext.revision.entity_chars>, 0),
(<feature.wikitext.revision.url_chars>, 0),
(<feature.wikitext.revision.word_chars>, 59),
(<feature.wikitext.revision.uppercase_word_chars>, 0),
(<feature.wikitext.revision.punctuation_chars>, 25),
(<feature.wikitext.revision.break_chars>, 0),
(<feature.wikitext.revision.longest_repeated_char>, 3),
(<feature.len(<datasource.tokenized(datasource.revision.text)>)>, 64),
(<feature.len(<datasource.wikitext.revision.numbers>)>, 0),
(<feature.len(<datasource.wikitext.revision.whitespaces>)>, 27),
(<feature.len(<datasource.wikitext.revision.markups>)>, 6),
(<feature.len(<datasource.wikitext.revision.cjks>)>, 0),
(<feature.len(<datasource.wikitext.revision.entities>)>, 0),
(<feature.len(<datasource.wikitext.revision.urls>)>, 0),
(<feature.len(<datasource.wikitext.revision.words>)>, 15),
(<feature.len(<datasource.wikitext.revision.uppercase_words>)>, 0),
(<feature.len(<datasource.wikitext.revision.punctuations>)>, 9),
(<feature.len(<datasource.wikitext.revision.breaks>)>, 0),
(<feature.max(<datasource.map(<built-in function len>, <datasource.tokenized(datasource.revision.text)>)>)>,
7),
(<feature.max(<datasource.map(<built-in function len>, <datasource.wikitext.revision.words>)>)>,
7),
(<feature.wikitext.revision.content_chars>, 102),
(<feature.wikitext.revision.headings>, 0),
(<feature.wikitext.revision.external_links>, 0),
(<feature.wikitext.revision.wikilinks>, 1),
(<feature.wikitext.revision.tags>, 1),
(<feature.wikitext.revision.ref_tags>, 1),
(<feature.wikitext.revision.templates>, 1),
(<feature.len(<datasource.english.badwords.revision.matches>)>, 0),
(<feature.len(<datasource.english.stopwords.revision.stopwords>)>, 6),
(<feature.len(<datasource.english.stopwords.revision.non_stopwords>)>, 9),
(<feature.len(<datasource.english.informals.revision.matches>)>, 1),
(<feature.(wikitext.revision.whitespace_chars / wikitext.revision.chars)>,
0.19424460431654678),
(<feature.(wikitext.revision.markup_chars / wikitext.revision.chars)>,
0.08633093525179857),
(<feature.(wikitext.revision.cjk_chars / wikitext.revision.chars)>, 0.0),
(<feature.(wikitext.revision.entity_chars / wikitext.revision.chars)>, 0.0),
(<feature.(wikitext.revision.url_chars / wikitext.revision.chars)>, 0.0),
(<feature.(wikitext.revision.word_chars / wikitext.revision.chars)>,
0.4244604316546763),
(<feature.(wikitext.revision.uppercase_word_chars / wikitext.revision.chars)>,
0.0),
(<feature.(wikitext.revision.punctuation_chars / wikitext.revision.chars)>,
0.17985611510791366),
(<feature.(wikitext.revision.break_chars / wikitext.revision.chars)>, 0.0),
(<feature.(wikitext.revision.longest_repeated_char / wikitext.revision.chars)>,
0.02158273381294964),
(<feature.(len(<datasource.wikitext.revision.numbers>) / len(<datasource.tokenized(datasource.revision.text)>))>,
0.0),
(<feature.(len(<datasource.wikitext.revision.whitespaces>) / len(<datasource.tokenized(datasource.revision.text)>))>,
0.421875),
(<feature.(len(<datasource.wikitext.revision.markups>) / len(<datasource.tokenized(datasource.revision.text)>))>,
0.09375),
(<feature.(len(<datasource.wikitext.revision.cjks>) / len(<datasource.tokenized(datasource.revision.text)>))>,
0.0),
(<feature.(len(<datasource.wikitext.revision.entities>) / len(<datasource.tokenized(datasource.revision.text)>))>,
0.0),
(<feature.(len(<datasource.wikitext.revision.urls>) / len(<datasource.tokenized(datasource.revision.text)>))>,
0.0),
(<feature.(len(<datasource.wikitext.revision.words>) / len(<datasource.tokenized(datasource.revision.text)>))>,
0.234375),
(<feature.(len(<datasource.wikitext.revision.uppercase_words>) / len(<datasource.tokenized(datasource.revision.text)>))>,
0.0),
(<feature.(len(<datasource.wikitext.revision.punctuations>) / len(<datasource.tokenized(datasource.revision.text)>))>,
0.140625),
(<feature.(len(<datasource.wikitext.revision.breaks>) / len(<datasource.tokenized(datasource.revision.text)>))>,
0.0),
(<feature.(max(<datasource.map(<built-in function len>, <datasource.tokenized(datasource.revision.text)>)>) / len(<datasource.tokenized(datasource.revision.text)>))>,
0.109375),
(<feature.(max(<datasource.map(<built-in function len>, <datasource.wikitext.revision.words>)>) / len(<datasource.tokenized(datasource.revision.text)>))>,
0.109375),
(<feature.(len(<datasource.english.badwords.revision.matches>) / len(<datasource.wikitext.revision.words>))>,
0.0),
(<feature.(len(<datasource.english.stopwords.revision.stopwords>) / len(<datasource.wikitext.revision.words>))>,
0.4),
(<feature.(len(<datasource.english.stopwords.revision.non_stopwords>) / len(<datasource.wikitext.revision.words>))>,
0.6),
(<feature.(len(<datasource.english.informals.revision.matches>) / len(<datasource.wikitext.revision.words>))>,
0.06666666666666667)]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment