Radim Řehůřek piskvorky

## gist:dccb0d12153d4fa248e1
def unescape(text):
    """Unescape HTML entities. Input is either unicode or utf8 string; output is always utf8 string."""
    # adapted from http://effbot.org/zone/re-sub.htm#unescape-html
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))

## gist:5659591d1b32c735c851
#!/usr/bin/env bash
# memusg -- Measure memory usage of processes
# Usage: memusg COMMAND [ARGS]...
#
# Author: Jaeho Shin <netj@sparcs.org>
# Created: 2010-08-16
set -um

# check input
[ $# -gt 0 ] || { sed -n '2,/^#$/ s/^# //p' <"$0"; exit 1; }

## f1c51f_doctopics.txt
#doc name topic proportion ...
0	0	75	0.27291790375080566	32	0.21542062655806327	91	0.12927086372321364	68	0.11494560291730632	61	0.08635709018778953	73	0.08631053678064	57	0.04319966089780646	28	0.02887936727834596	35	0.014432704123391758	2	2.7911487760191677E-4	45	2.725940123978988E-4	70	2.722625036959093E-4	1	2.3256451843164092E-4	4	2.259570167639352E-4	90	2.2474410586609787E-4	11	2.1430911691440254E-4	58	2.1245904887584203E-4	37	1.8123208755492314E-4	34	1.7234105812375972E-4	15	1.6427407506024936E-4	19	1.632136015179282E-4	10	1.61607028408238E-4	40	1.5930526806191378E-4	51	1.5772117871213373E-4	65	1.5680468190552038E-4	80	1.4563788697690428E-4	99	1.4313460405450642E-4	53	1.4292366928899937E-4	9	1.4076808293294913E-4	59	1.3759332248082507E-4	84	1.335961877850498E-4	41	1.2204449635956596E-4	74	1.1888716247753189E-4	50	1.1866772486063238E-4	76	1.1260766559875016E-4	98	1.1233681058558284E-4	22	1.0940677855090405E-4	56	9.196073733499436E-5	64	9.181876832622669E-5	42	9.119024203586985E-5	72	8.932367891104672E-5

## gist:84e6a910b28cf8cbbed6e1618499a280
(st)[kofola3@kofola3:~/workspace/scaletext] (scaletext2)$ python -m scaletext.scripts.load_tab_separated_data --es-index wiki1k ./enwiki-1k-articles.txt
2017-01-19 01:20:41,713 : MainProcess : INFO : running /Volumes/work/workspace/scaletext/scaletext/scripts/load_tab_separated_data.py --es-index wiki1k ./enwiki-1k-articles.txt
2017-01-19 01:20:43,165 : MainProcess : INFO : 100 documents loaded; last: Art
2017-01-19 01:20:44,371 : MainProcess : INFO : 200 documents loaded; last: Albert Camus
2017-01-19 01:20:45,525 : MainProcess : INFO : 300 documents loaded; last: Atomic
2017-01-19 01:20:46,575 : MainProcess : INFO : 400 documents loaded; last: Dasyproctidae
2017-01-19 01:20:47,443 : MainProcess : INFO : 500 documents loaded; last: Afonso de Albuquerque
2017-01-19 01:20:48,176 : MainProcess : INFO : 600 documents loaded; last: Anacharsis
2017-01-19 01:20:49,008 : MainProcess : INFO : 700 documents loaded; last: Annealing
2017-01-19 01:20:50,112 : MainProcess : INFO : 800 documents loaded; last: Abijah

## method1.sh
curl -X GET 'http://localhost:9200/3500000_1000000/_search?size=100&pretty=1' -d '{"fields": [], "query": {"match": {"one_0_1_0": "0P1i0d2 1P1ineg0d1 2P1ineg0d0 3P1i0d1 4P1i0d1 5P1i0d0 6P1i0d1 7P1ineg0d0 8P1ineg0d0 9P1ineg0d0 10P1ineg0d1 11P1ineg0d0 12P1i0d1 13P1ineg0d0 14P1ineg0d0 15P1i0d0 16P1i0d1 17P1ineg0d1 18P1ineg0d1 19P1ineg0d0 20P1ineg0d1 21P1ineg0d0 22P1i0d1 23P1i0d1 24P1i0d0 25P1i0d0 26P1i0d1 27P1i0d0 28P1i0d0 29P1ineg0d1 30P1ineg0d0 31P1ineg0d0 32P1i0d1 33P1i0d1 34P1i0d0 35P1ineg0d1 36P1i0d1 37P1ineg0d1 38P1i0d1 39P1i0d1 40P1i0d1 41P1ineg0d1 42P1ineg0d1 43P1i0d1 44P1i0d0 45P1i0d1 46P1i0d0 47P1i0d1 48P1ineg0d0 49P1i0d0 50P1i0d1 51P1ineg0d0 52P1ineg0d1 53P1i0d0 54P1ineg0d0 55P1i0d0 56P1ineg0d0 57P1i0d1 58P1ineg0d0 59P1i0d1 60P1ineg0d1 61P1i0d0 62P1i0d0 63P1i0d0 64P1i0d0 65P1i0d1 66P1ineg0d0 67P1i0d1 68P1ineg0d0 69P1i0d1 70P1i0d0 71P1i0d0 72P1i0d1 73P1i0d0 74P1ineg0d0 75P1i0d0 76P1ineg0d0 77P1i0d0 78P1ineg0d0 79P1ineg0d0 80P1i0d1 81P1i0d0 82P1ineg0d1 83P1i0d1 84P1ineg0d0 85P1i0d0 86P1i0d0 87P1ineg0d0

## method2.sh
curl -X GET 'http://localhost:9200/f_3500000_1000000/_search?size=100&pretty=1' -d '{"query": {"bool": {"should": [{"range": {"213": {"boost": 1.2097011357545853, "gte": 0.0048505678772926275, "lte": 0.20485056787729264}}}, {"range": {"210": {"boost": 1.155477911233902, "gte": -0.177738955616951, "lte": 0.022261044383049017}}}, {"range": {"137": {"boost": 1.170813649892807, "gte": -0.014593175053596502, "lte": 0.1854068249464035}}}, {"range": {"24": {"boost": 1.213587835431099, "gte": 0.0067939177155494634, "lte": 0.20679391771554947}}}, {"range": {"1": {"boost": 1.4646067321300507, "gte": 0.13230336606502532, "lte": 0.3323033660650253}}}, {"range": {"27": {"boost": 1.1932272613048553, "gte": -0.003386369347572332, "lte": 0.19661363065242768}}}, {"range": {"21": {"boost": 1.1098769381642342, "gte": -0.1549384690821171, "lte": 0.045061530917882925}}}, {"range": {"23": {"boost": 1.141402691602707, "gte": -0.02929865419864655, "lte": 0.17070134580135346}}}, {"range": {"4": {"boost": 1.1348381340503693, "gte": -0

## gist:d02bb12713448fb2a61bb8007156e894
/Volumes/work/workspace/gensim/trunk/gensim/corpora/textcorpus.py:docstring of gensim.corpora.textcorpus.remove_stopwords:1: WARNING: Inline interpreted text or phrase reference start-string without end-string.
/Volumes/work/workspace/gensim/trunk/gensim/models/word2vec.py:docstring of gensim.models.word2vec.Word2Vec.score:13: WARNING: duplicate citation taddy, other instance in /Volumes/work/workspace/gensim/trunk/docs/src/models/doc2vec.rst
/Volumes/work/workspace/gensim/trunk/gensim/models/word2vec.py:docstring of gensim.models.word2vec.Word2Vec.score:14: WARNING: duplicate citation deepir, other instance in /Volumes/work/workspace/gensim/trunk/docs/src/models/doc2vec.rst
/Volumes/work/workspace/gensim/trunk/gensim/models/wrappers/fasttext.py:docstring of gensim.models.wrappers.fasttext.FastText.score:13: WARNING: duplicate citation taddy, other instance in /Volumes/work/workspace/gensim/trunk/docs/src/models/word2vec.rst
/Volumes/work/workspace/gensim/trunk/gensim/models/wrappers/fasttext.py:docstring of

## gist:1ae399373524ee6dc5640fe15e2076da
[kofola3@kofola3:~/workspace/bounter] (master)$ python setup.py test
running test
running egg_info
writing bounter.egg-info/PKG-INFO
writing top-level names to bounter.egg-info/top_level.txt
writing dependency_links to bounter.egg-info/dependency_links.txt
writing pbr to bounter.egg-info/pbr.json
reading manifest file 'bounter.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'bounter.egg-info/SOURCES.txt'

## gist:84f1c3ab351c5d88397130298509c47b
======================================================================
ERROR: test_iteritems (bounter.tests.hashtable.test_htc_iteration.HashTableIterationTest)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/Volumes/work/workspace/bounter/bounter/tests/hashtable/test_htc_iteration.py", line 43, in test_iteritems
    self.assertEqual(set(self.ht.iteritems()), self.pairs)
AttributeError: 'bounter_htc.HT_Basic' object has no attribute 'iteritems'

======================================================================
ERROR: test_iterkeys (bounter.tests.hashtable.test_htc_iteration.HashTableIterationTest)

## example.md

      
        
          
            
              
              1 file
            
          
          
            
              
              0 forks
            
          
          
            
              
              0 comments
            
          
          
            
              
              0 stars
            
          
        
        
          
              
          
          
            
                piskvorky
                / example.md
            
            
              Last active
              November 4, 2017 20:52
            
          
        
      
        
  
      
    🌟 New features:

Massive optimizations to LSI model training (@isamaru, #1620 & #1622)

The LSI model now allows the use of single precision (float32), consuming 40% less memory while being 40% faster.
The LSI model can now also accept a CSC matrix as input, for a further memory and speed boost.
Overall, if your entire corpus fits in RAM: 3x faster LSI training (SVD) in 4x less memory!
# just an example; the corpus stream is up to you
streaming_corpus = gensim.corpora.MmCorpus("my_tfidf_corpus.mm.gz")
	def unescape(text):
	"""Unescape HTML entities. Input is either unicode or utf8 string; output is always utf8 string."""
	# adapted from http://effbot.org/zone/re-sub.htm#unescape-html
	def fixup(m):
	text = m.group(0)
	if text[:2] == "&#":
	# character reference
	try:
	if text[:3] == "&#x":
	return unichr(int(text[3:-1], 16))
	#!/usr/bin/env bash
	# memusg -- Measure memory usage of processes
	# Usage: memusg COMMAND [ARGS]...
	#
	# Author: Jaeho Shin <netj@sparcs.org>
	# Created: 2010-08-16
	set -um

	# check input
	[ $# -gt 0 ] || { sed -n '2,/^#$/ s/^# //p' <"$0"; exit 1; }
	(st)[kofola3@kofola3:~/workspace/scaletext] (scaletext2)$ python -m scaletext.scripts.load_tab_separated_data --es-index wiki1k ./enwiki-1k-articles.txt
	2017-01-19 01:20:41,713 : MainProcess : INFO : running /Volumes/work/workspace/scaletext/scaletext/scripts/load_tab_separated_data.py --es-index wiki1k ./enwiki-1k-articles.txt
	2017-01-19 01:20:43,165 : MainProcess : INFO : 100 documents loaded; last: Art
	2017-01-19 01:20:44,371 : MainProcess : INFO : 200 documents loaded; last: Albert Camus
	2017-01-19 01:20:45,525 : MainProcess : INFO : 300 documents loaded; last: Atomic
	2017-01-19 01:20:46,575 : MainProcess : INFO : 400 documents loaded; last: Dasyproctidae
	2017-01-19 01:20:47,443 : MainProcess : INFO : 500 documents loaded; last: Afonso de Albuquerque
	2017-01-19 01:20:48,176 : MainProcess : INFO : 600 documents loaded; last: Anacharsis
	2017-01-19 01:20:49,008 : MainProcess : INFO : 700 documents loaded; last: Annealing
	2017-01-19 01:20:50,112 : MainProcess : INFO : 800 documents loaded; last: Abijah
	/Volumes/work/workspace/gensim/trunk/gensim/corpora/textcorpus.py:docstring of gensim.corpora.textcorpus.remove_stopwords:1: WARNING: Inline interpreted text or phrase reference start-string without end-string.
	/Volumes/work/workspace/gensim/trunk/gensim/models/word2vec.py:docstring of gensim.models.word2vec.Word2Vec.score:13: WARNING: duplicate citation taddy, other instance in /Volumes/work/workspace/gensim/trunk/docs/src/models/doc2vec.rst
	/Volumes/work/workspace/gensim/trunk/gensim/models/word2vec.py:docstring of gensim.models.word2vec.Word2Vec.score:14: WARNING: duplicate citation deepir, other instance in /Volumes/work/workspace/gensim/trunk/docs/src/models/doc2vec.rst
	/Volumes/work/workspace/gensim/trunk/gensim/models/wrappers/fasttext.py:docstring of gensim.models.wrappers.fasttext.FastText.score:13: WARNING: duplicate citation taddy, other instance in /Volumes/work/workspace/gensim/trunk/docs/src/models/word2vec.rst
	/Volumes/work/workspace/gensim/trunk/gensim/models/wrappers/fasttext.py:docstring of
	[kofola3@kofola3:~/workspace/bounter] (master)$ python setup.py test
	running test
	running egg_info
	writing bounter.egg-info/PKG-INFO
	writing top-level names to bounter.egg-info/top_level.txt
	writing dependency_links to bounter.egg-info/dependency_links.txt
	writing pbr to bounter.egg-info/pbr.json
	reading manifest file 'bounter.egg-info/SOURCES.txt'
	reading manifest template 'MANIFEST.in'
	writing manifest file 'bounter.egg-info/SOURCES.txt'
	======================================================================
	ERROR: test_iteritems (bounter.tests.hashtable.test_htc_iteration.HashTableIterationTest)
	----------------------------------------------------------------------
	Traceback (most recent call last):
	File "/Volumes/work/workspace/bounter/bounter/tests/hashtable/test_htc_iteration.py", line 43, in test_iteritems
	self.assertEqual(set(self.ht.iteritems()), self.pairs)
	AttributeError: 'bounter_htc.HT_Basic' object has no attribute 'iteritems'

	======================================================================
	ERROR: test_iterkeys (bounter.tests.hashtable.test_htc_iteration.HashTableIterationTest)