Radim Řehůřek piskvorky

## combined_climb.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Script to calculate possible final standings for "combined climbing" (Olympics 2020 format)
from incomplete in-progress results:

BEST-WORST POSSIBLE FINAL STANDINGS
===================================
A. Gines Lopez: 1-2

## gensim_benchmark.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2020 Radim Rehurek <me@radimrehurek.com>

"""
Help script (template) for benchmarking. Run with:

  /usr/bin/time --format "%E elapsed\n%Mk peak RAM" python gensim_benchmark.py ~/gensim-data/text9/text9.txt

## smaps.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2019 RARE Technologies s.r.o.
# Authors: Radim Rehurek <radim@rare-technologies.com>
# MIT License

"""
Find private/shared memory of one or more processes, identified by their process ids (PIDs).

## sum_sentence.c
#include <Python.h>

/*
  Return the sum of word lengths of all words (unicode strings) in the list `sentence`.
  Return -1 if `sentence` isn't a list, and -2 if any of its elements is not a unicode string.

  `sentence` and its elements are const = never changed inside this function, and guaranteed to live
  throughout its execution, so we don't bother updating any reference counts.
*/
static long long process_const_sentence(PyObject *sentence) {

## gist:cbac8a3df9fb605d6ee5fea2390962b6
looking for now-outdated files... none found
pickling environment... done
checking consistency... done
preparing documents... done
writing output... [  0%] about
writing output... [  1%] apiref
writing output... [  2%] changes_080
writing output... [  3%] corpora/bleicorpus
writing output... [  4%] corpora/corpora
writing output... [  5%] corpora/csvcorpus

## gist:416c8ab83970d067d148301f0be57a12
Warning, treated as error:
/Volumes/work/workspace/gensim/trunk/docs/src/viz/poincare.rst:4: WARNING: autodoc: failed to import module u'gensim.viz.poincare'; the following exception was raised:
Traceback (most recent call last):
  File "/Users/kofola3/workspace/vew/gensim/lib/python2.7/site-packages/sphinx/ext/autodoc.py", line 551, in import_object
    __import__(self.modname)
  File "/Volumes/work/workspace/gensim/trunk/gensim/viz/poincare.py", line 18, in <module>
    import plotly.graph_objs as go
ImportError: No module named plotly.graph_objs

make: *** [html] Error 1

## example.json
{
	"pii_type": "passport",
	"severity": "high",
	"file_format": ["pdf", "scanned", "archive"],
	"archive_name": "Visas for reInvent 2017.tar.gz",
	"file_name": "maria_p_scanned.pdf",
	"ingest_source": "s3://laptop_backups/maria/2017/11/Documents/",
	"pii_instances": [
		{"name": "Maria Pereira"},
		{"date_of_birth": "1984/07/10"},

## segment_wiki.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              1 star
            
          
                piskvorky
                / segment_wiki.md
            
            
              Created
              November 11, 2017 15:36
            
          
    CLI script for extracting plain text out of a raw Wikipedia dump. This is an xml.bz2 file provided by MediaWiki and looks like {LANG}wiki-{YYYYMMDD}-pages-articles.xml.bz2 or {LANG}wiki-latest-pages-articles.xml.bz2 (e.g. 14 GB: https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2).
It streams through all the XML articles using multiple cores (#cores - 1, by default), decompressing on the fly and extracting plain text article sections from each article.
For each extracted article, it prints its title, section names and plaintext section contents, in json-line format.
Examples

bash

  
## example.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                piskvorky
                / example.md
            
            
              Last active
              November 4, 2017 20:52
            
          
    🌟 New features:

Massive optimizations to LSI model training (@isamaru, #1620 & #1622)

LSI model allows use of single precision (float32) to consume 40% less memory while being 40% faster.
LSI model can now also accept CSC matrix as input, for further memory and speed boost.
Overall, if your entire corpus fits in RAM: 3x faster LSI training (SVD) in 4x less memory!
# just an example; the corpus stream is up to you
streaming_corpus = gensim.corpora.MmCorpus("my_tfidf_corpus.mm.gz")  


## gist:84f1c3ab351c5d88397130298509c47b
======================================================================
ERROR: test_iteritems (bounter.tests.hashtable.test_htc_iteration.HashTableIterationTest)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/Volumes/work/workspace/bounter/bounter/tests/hashtable/test_htc_iteration.py", line 43, in test_iteritems
    self.assertEqual(set(self.ht.iteritems()), self.pairs)
AttributeError: 'bounter_htc.HT_Basic' object has no attribute 'iteritems'

======================================================================
ERROR: test_iterkeys (bounter.tests.hashtable.test_htc_iteration.HashTableIterationTest)
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	"""
	Script to calculate possible final standings for "combined climbing" (Olympics 2020 format)
	from incomplete in-progress results:

	BEST-WORST POSSIBLE FINAL STANDINGS
	===================================
	A. Gines Lopez: 1-2
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	#
	# Copyright (C) 2020 Radim Rehurek <me@radimrehurek.com>

	"""
	Help script (template) for benchmarking. Run with:

	/usr/bin/time --format "%E elapsed\n%Mk peak RAM" python gensim_benchmark.py ~/gensim-data/text9/text9.txt
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	#
	# Copyright (C) 2019 RARE Technologies s.r.o.
	# Authors: Radim Rehurek <radim@rare-technologies.com>
	# MIT License

	"""
	Find private/shared memory of one or more processes, identified by their process ids (PIDs).
	#include <Python.h>

	/*
	Return the sum of word lengths of all words (unicode strings) in the list `sentence`.
	Return -1 if `sentence` isn't a list, and -2 if any of its elements is not a unicode string.

	`sentence` and its elements are const = never changed inside this function, and guaranteed to live
	throughout its execution, so we don't bother updating any reference counts.
	*/
	static long long process_const_sentence(PyObject *sentence) {
	looking for now-outdated files... none found
	pickling environment... done
	checking consistency... done
	preparing documents... done
	writing output... [ 0%] about
	writing output... [ 1%] apiref
	writing output... [ 2%] changes_080
	writing output... [ 3%] corpora/bleicorpus
	writing output... [ 4%] corpora/corpora
	writing output... [ 5%] corpora/csvcorpus
	Warning, treated as error:
	/Volumes/work/workspace/gensim/trunk/docs/src/viz/poincare.rst:4: WARNING: autodoc: failed to import module u'gensim.viz.poincare'; the following exception was raised:
	Traceback (most recent call last):
	File "/Users/kofola3/workspace/vew/gensim/lib/python2.7/site-packages/sphinx/ext/autodoc.py", line 551, in import_object
	__import__(self.modname)
	File "/Volumes/work/workspace/gensim/trunk/gensim/viz/poincare.py", line 18, in <module>
	import plotly.graph_objs as go
	ImportError: No module named plotly.graph_objs

	make: *** [html] Error 1
	{
	"pii_type": "passport",
	"severity": "high",
	"file_format": ["pdf", "scanned", "archive"],
	"archive_name": "Visas for reInvent 2017.tar.gz",
	"file_name": "maria_p_scanned.pdf",
	"ingest_source": "s3://laptop_backups/maria/2017/11/Documents/",
	"pii_instances": [
	{"name": "Maria Pereira"},
	{"date_of_birth": "1984/07/10"},
	======================================================================
	ERROR: test_iteritems (bounter.tests.hashtable.test_htc_iteration.HashTableIterationTest)
	----------------------------------------------------------------------
	Traceback (most recent call last):
	File "/Volumes/work/workspace/bounter/bounter/tests/hashtable/test_htc_iteration.py", line 43, in test_iteritems
	self.assertEqual(set(self.ht.iteritems()), self.pairs)
	AttributeError: 'bounter_htc.HT_Basic' object has no attribute 'iteritems'

	======================================================================
	ERROR: test_iterkeys (bounter.tests.hashtable.test_htc_iteration.HashTableIterationTest)