@ixaxaar
Last active August 18, 2022 13:51
  1. Download the model
if [[ ! -e 'numberbatch-17.06.txt' ]]; then
    wget https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz
    gunzip numberbatch-17.06.txt.gz
fi
sudo pip install wordfreq
sudo pip install gensim
  2. Load the model
import gensim

# Newer gensim versions load word2vec-format files via KeyedVectors
# (Word2Vec.load_word2vec_format was deprecated and later removed):
numberbatch = gensim.models.KeyedVectors.load_word2vec_format(
    '/home/ixaxaar/src/neural/msmarco/numberbatch-17.06.txt',
    binary=False,
    unicode_errors='ignore'
)
  3. Convert terms to ConceptNet-style URIs, e.g. word -> /c/en/word
import wordfreq
import re

# English-specific stopword handling
STOPWORDS = ['the', 'a', 'an']
DROP_FIRST = ['to']
DOUBLE_DIGIT_RE = re.compile(r'[0-9][0-9]')
DIGIT_RE = re.compile(r'[0-9]')

def standardized_uri(language, term):
    """
    Get a URI that is suitable to label a row of a vector space, by making sure
    that both ConceptNet's and word2vec's normalizations are applied to it.

    'language' should be a BCP 47 language code, such as 'en' for English.

    If the term already looks like a ConceptNet URI, it will only have its
    sequences of digits replaced by #. Otherwise, it will be turned into a
    ConceptNet URI in the given language, and then have its sequences of digits
    replaced.
    """
    if not (term.startswith('/') and term.count('/') >= 2):
        term = _standardized_concept_uri(language, term)
    return replace_numbers(term)

def english_filter(tokens):
    """
    Given a list of tokens, remove a small list of English stopwords. This
    helps to work with previous versions of ConceptNet, which often provided
    phrases such as 'an apple' and assumed they would be standardized to
    'apple'.
    """
    non_stopwords = [token for token in tokens if token not in STOPWORDS]
    while non_stopwords and non_stopwords[0] in DROP_FIRST:
        non_stopwords = non_stopwords[1:]
    if non_stopwords:
        return non_stopwords
    else:
        return tokens

def replace_numbers(s):
    """
    Replace digits with # in any term where a sequence of two digits appears.

    This operation is applied to text that passes through word2vec, so we
    should match it.
    """
    if DOUBLE_DIGIT_RE.search(s):
        return DIGIT_RE.sub('#', s)
    else:
        return s

def _standardized_concept_uri(language, term):
    if language == 'en':
        token_filter = english_filter
    else:
        token_filter = None
    language = language.lower()
    norm_text = _standardized_text(term, token_filter)
    return '/c/{}/{}'.format(language, norm_text)

def _standardized_text(text, token_filter):
    tokens = simple_tokenize(text.replace('_', ' '))
    if token_filter is not None:
        tokens = token_filter(tokens)
    return '_'.join(tokens)

def simple_tokenize(text):
    """
    Tokenize text using the default wordfreq rules.
    """
    return wordfreq.tokenize(text, 'xx')
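To make the digit-collapsing rule concrete, here is a self-contained sketch of what replace_numbers does (the patterns are duplicated from the code above so the snippet runs on its own, without wordfreq):

```python
import re

# Same patterns as in the gist above
DOUBLE_DIGIT_RE = re.compile(r'[0-9][0-9]')
DIGIT_RE = re.compile(r'[0-9]')

def replace_numbers(s):
    # Digits are collapsed to '#' only when at least two appear in a row,
    # matching the normalization applied to text that passes through word2vec.
    if DOUBLE_DIGIT_RE.search(s):
        return DIGIT_RE.sub('#', s)
    return s

print(replace_numbers('/c/en/apple_2017'))  # /c/en/apple_####
print(replace_numbers('/c/en/route_66'))    # /c/en/route_##
print(replace_numbers('/c/en/7_up'))        # /c/en/7_up (single digit, unchanged)
```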
numberbatch.similarity(standardized_uri('hi', 'लड़का'), standardized_uri('en', 'boy'))
# 0.94321722690479937

numberbatch.most_similar(positive=[standardized_uri('en', 'boy')], negative=[], topn=100)
# [('/c/en/one_of_boys', 0.9874493479728699),
#  ('/c/no/gutt', 0.9785364270210266),
#  ('/c/da/dreng', 0.9698919057846069),
#  ('/c/ko/소년', 0.9656598567962646),
#  ('/c/th/เด็กผู้ชาย', 0.9656425714492798),
#  ('/c/eo/knabo', 0.9644594192504883),
#  ('/c/sh/dječak', 0.9644368290901184),
#  ('/c/te/బాలుడు', 0.9585086703300476),
#  ('/c/fi/nallikka', 0.9545205235481262),
#  ('/c/af/seuntjie', 0.9545205235481262),
#  ('/c/is/strákur', 0.9543156027793884),
#  ('/c/is/piltur', 0.9539886116981506),
#  ('/c/no/gut', 0.953417181968689),
#  ('/c/et/poiss', 0.9525488018989563),
#  ('/c/ro/băiat', 0.95219886302948),
#  ('/c/en/male_child', 0.9515728950500488),
#  ('/c/cs/chlapec', 0.9511594176292419),
#  ('/c/pl/chłopaczek', 0.9509129524230957),
#  ('/c/el/αγορι', 0.9485906362533569),
#  ('/c/ca/minyó', 0.9483424425125122),
#  ('/c/lv/puika', 0.9478499293327332),
#  ('/c/ja/男の_コ', 0.9478290677070618),
#  ('/c/is/stráksi', 0.9469888806343079),
#  ('/c/sw/mvulana', 0.9467336535453796),
#  ('/c/ja/男の児', 0.9465839266777039),
#  ('/c/sv/pilt', 0.9456952214241028),
#  ('/c/ga/garsún', 0.9450075030326843),
#  ('/c/hi/लड़का', 0.9432169795036316),
#  ('/c/ca/noi', 0.9422826766967773),
#  ('/c/sv/pojke', 0.9395612478256226),
#  ('/c/pt/garotinho', 0.93843013048172),
#  ('/c/ja/坊ち', 0.9382251501083374),
#  ('/c/ar/وَلَد', 0.9382251501083374),
#  ('/c/lv/zēns', 0.9378499388694763),
#  ('/c/fil/iho', 0.9346551299095154),
#  ('/c/lv/puisēns', 0.9341316819190979),
#  ('/c/is/drengur', 0.933497428894043),
#  ('/c/lv/puisītis', 0.9331524968147278),
#  ('/c/eo/knabego', 0.9320058226585388),
#  ('/c/cs/hoch', 0.9316845536231995),
#  ('/c/ar/فَتَى', 0.9304484128952026),
#  ('/c/ku/مندار', 0.9279160499572754),
#  ('/c/th/เด็กชาย', 0.9273474812507629),
#  ('/c/sv/gosse', 0.9269068837165833),
#  ('/c/tr/çoçuk', 0.9266089797019958),
#  ('/c/ja/男児', 0.9250114560127258),
#  ('/c/nl/mannelijke_baby', 0.9247658252716064),
#  ('/c/nl/mannelijk_kind', 0.9247658252716064),
#  ('/c/tr/oğlan', 0.9233790636062622),
#  ('/c/es/muchacho', 0.9222103357315063),
#  ('/c/sk/chlapec', 0.9214852452278137),
#  ('/c/cy/bachgen', 0.92059326171875),
#  ('/c/it/ragazzo', 0.9198938608169556),
#  ('/c/ast/rapaz', 0.9196257591247559),
#  ('/c/ga/gasúr', 0.9189478158950806),
#  ('/c/hu/fiú', 0.9188843965530396),
#  ('/c/ro/băieți', 0.9180207252502441),
#  ('/c/sh/дечак', 0.9178452491760254),
#  ('/c/lv/puisis', 0.9175843000411987),
#  ('/c/ang/cnapa', 0.9171717166900635),
#  ('/c/lv/zeņķis', 0.9169554114341736),
#  ('/c/fr/jeune_garçon', 0.9164935946464539),
#  ('/c/eo/knabeto', 0.9159650802612305),
#  ('/c/pt/rapazola', 0.9152440428733826),
#  ('/c/ja/男の子', 0.9131519198417664),
#  ('/c/uk/хлопець', 0.9127181172370911),
#  ('/c/sq/djalë', 0.9113969206809998),
#  ('/c/cs/ogar', 0.9107626676559448),
#  ('/c/pt/rapazinho', 0.910697877407074),
#  ('/c/fr/garçon', 0.909964919090271),
#  ('/c/it/ragazzino', 0.9091301560401917),
#  ('/c/ka/ბიჭი', 0.9076249003410339),
#  ('/c/sl/deček', 0.9061344265937805),
#  ('/c/la/puerulus', 0.9049318432807922),
#  ('/c/ja/彦', 0.9044259786605835),
#  ('/c/ang/cnafa', 0.9029330611228943),
#  ('/c/sv/grabb', 0.9010026454925537),
#  ('/c/de/knabe', 0.9005921483039856),
#  ('/c/ja/童男', 0.9004682898521423),
#  ('/c/gd/balach', 0.8983445167541504),
#  ('/c/pt/mancebo', 0.8974050283432007),
#  ('/c/ga/buachaill', 0.8962365388870239),
#  ('/c/pt/garotão', 0.8949055075645447),
#  ('/c/es/churumbel', 0.8935650587081909),
#  ('/c/hu/srác', 0.8919544219970703),
#  ('/c/xcl/պատանեկիկ', 0.8910072445869446),
#  ('/c/it/ragazzetto', 0.8906833529472351),
#  ('/c/es/chaval', 0.890276312828064),
#  ('/c/bg/момче', 0.8902152180671692),
#  ('/c/gd/laochan', 0.8901078104972839),
#  ('/c/de/junge', 0.8900668025016785),
#  ('/c/io/puerulo', 0.8896881341934204),
#  ('/c/pt/menino', 0.8894144296646118),
#  ('/c/sq/çun', 0.8889287114143372),
#  ('/c/ur/لڑکا', 0.8888120651245117),
#  ('/c/es/chico', 0.8873262405395508),
#  ('/c/ja/和郎', 0.8868381381034851),
#  ('/c/fi/poika', 0.8863088488578796),
#  ('/c/lt/vaikinas', 0.88576340675354),
#  ('/c/sa/किशोर', 0.885681688785553)]
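Since most_similar on the multilingual model returns neighbours from every language, a small post-filter helps when only one language is wanted. filter_by_language is a hypothetical helper, not part of the gist, shown here on a few entries from the output above:

```python
def filter_by_language(results, lang='en', topn=10):
    # Keep only results whose ConceptNet URI starts with /c/<lang>/
    prefix = '/c/{}/'.format(lang)
    return [(uri, score) for uri, score in results if uri.startswith(prefix)][:topn]

# Example with a few entries from the output above
sample = [
    ('/c/en/one_of_boys', 0.9874),
    ('/c/no/gutt', 0.9785),
    ('/c/en/male_child', 0.9516),
]
print(filter_by_language(sample))
# [('/c/en/one_of_boys', 0.9874), ('/c/en/male_child', 0.9516)]
```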
@ddofer

ddofer commented Nov 11, 2020

Useful snippets, thanks!
How many are still needed for the monolingual/English-only Numberbatch?

@ixaxaar
Author

ixaxaar commented Nov 18, 2020

Did not understand the question there. How many of what are needed?

I think the use for the above script was to get all "similar" words in a cross-linguistic task.

@rrmehdi

rrmehdi commented Aug 18, 2022

I am not sure how you get those similarities in your example. Your code does not work for me.
My test:
numberbatch.similarity(standardized_uri('en', 'paper'), standardized_uri('en', 'tissue'))
KeyError: "Key '/c/en/paper' not present"

I am using the same embeddings as you, and also tried the more recent 19.08 collection, which is larger than 17.06:
numberbatch = gensim.models.word2vec.KeyedVectors.load_word2vec_format(
    'data/numberbatch-en-17.06.txt',
    binary=False,
    unicode_errors='ignore'
)
Note also the change in how gensim now parses the vectors (KeyedVectors rather than Word2Vec).
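A likely cause of the KeyError (an assumption, not confirmed in this thread): the English-only numberbatch-en files key their vectors by bare words rather than /c/en/... URIs, so standardized_uri produces keys that are absent from the vocabulary. A hypothetical fallback lookup, sketched against gensim >= 4's key_to_index mapping with a tiny stand-in model:

```python
def lookup_key(model, lang, term):
    # Try the ConceptNet URI first, then fall back to the bare term,
    # which is how the English-only files appear to be keyed.
    uri = '/c/{}/{}'.format(lang, term)
    if uri in model.key_to_index:
        return uri
    return term

# Tiny stand-in for a loaded English-only model (real models expose
# the same key_to_index dict in gensim >= 4)
class FakeModel:
    key_to_index = {'paper': 0, 'tissue': 1}

print(lookup_key(FakeModel, 'en', 'paper'))  # paper
```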
