Miopas Miopas

## download_file_from_gcp.py
'''
To access your bucket in Linux, you need to download the project credential from GCP and set its path in an environment variable:
 > export GOOGLE_APPLICATION_CREDENTIALS='/path/to/dir/*.json'
Same for uploading files.
'''
import sys
from google.cloud import storage

def download_blob(bucket_name, source_blob_name, destination_file_name):
    '''Downloads a blob from the bucket.'''

## download_file_from_google_drive.py
import requests
import pdb
import sys

def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"

    session = requests.Session()

    response = session.get(URL, params = { 'id' : id }, stream = True)

## parse_bz2.py
'''
python parse_bz2.py *.bz2 ${dest}
'''

import sys
from bz2 import BZ2File as bzopen
import json
import pandas as pd

infile = sys.argv[1]

## read_zst.py
# python 3.6
import zstandard
import pathlib
import shutil
import os
import math
import pandas as pd
import sys

def decompress_zstandard_to_folder(input_file, destination_dir):

## latex.txt
# fix the order of citations
\bibliographystyle{unsrtnat}
\usepackage[numbers,sort&compress]{natbib}


# figures layout
# credit: https://tex.stackexchange.com/questions/129077/figure-in-beamer
\begin{tabular}{p{.3\textwidth} p{.7\textwidth}}
\adjincludegraphics[width=.8\linewidth,valign=t]{example-image}
&

## awk.sh
# word count
cat result | awk -F"\t" '{key=$1"\t"$2; c[key]++} END {for (i in c) print c[i],i}'

# sum, count, mean, median, min, max
# credit:https://unix.stackexchange.com/questions/13731/is-there-a-way-to-get-the-min-max-median-and-average-of-a-list-of-numbers-in
#!/bin/sh
cat test.txt | sort -n | awk '
  BEGIN {
    c = 0;
    sum = 0;

## my_iTerm2.json
{
  "Ansi 5 Color" : {
    "Green Component" : 0.10802463442087173,
    "Red Component" : 0.77738940715789795,
    "Blue Component" : 0.43516635894775391
  },
  "Tags" : [

  ],
  "Ansi 12 Color" : {

## html_entity_convert.py
# reference: https://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string

from bs4 import BeautifulSoup
BeautifulSoup("<p>&pound;682m</p>")

# output is like:
# <html><body><p>£682m</p></body></html>


## gensim.py
# train
import logging
import os
from gensim.models import word2vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences = word2vec.LineSentence('/path/to/your/data')

model = word2vec.Word2Vec(sentences,

## neo4j.sh
./cypher-shell -a bolt://localhost:1688 -u neo4j -p work "match(n)-[r]->(m) return n, r, m limit 10;" >/tmp/a.txt
	'''
	To access your bucket in Linux, you need to download the project credential from GCP and set its path in an environment variable:
	> export GOOGLE_APPLICATION_CREDENTIALS='/path/to/dir/*.json'
	Same for uploading files.
	'''
	import sys
	from google.cloud import storage

	def download_blob(bucket_name, source_blob_name, destination_file_name):
	'''Downloads a blob from the bucket.'''
	import requests
	import pdb
	import sys

	def download_file_from_google_drive(id, destination):
	URL = "https://docs.google.com/uc?export=download"

	session = requests.Session()

	response = session.get(URL, params = { 'id' : id }, stream = True)
	'''
	python parse_bz2.py *.bz2 ${dest}
	'''

	import sys
	from bz2 import BZ2File as bzopen
	import json
	import pandas as pd

	infile = sys.argv[1]
	# python 3.6
	import zstandard
	import pathlib
	import shutil
	import os
	import math
	import pandas as pd
	import sys

	def decompress_zstandard_to_folder(input_file, destination_dir):
	# fix the order of citations
	\bibliographystyle{unsrtnat}
	\usepackage[numbers,sort&compress]{natbib}


	# figures layout
	# credit: https://tex.stackexchange.com/questions/129077/figure-in-beamer
	\begin{tabular}{p{.3\textwidth} p{.7\textwidth}}
	\adjincludegraphics[width=.8\linewidth,valign=t]{example-image}
	&
	# word count
	cat result | awk -F"\t" '{key=$1"\t"$2; c[key]++} END {for (i in c) print c[i],i}'

	# sum, count, mean, median, min, max
	# credit:https://unix.stackexchange.com/questions/13731/is-there-a-way-to-get-the-min-max-median-and-average-of-a-list-of-numbers-in
	#!/bin/sh
	cat test.txt | sort -n | awk '
	BEGIN {
	c = 0;
	sum = 0;
	{
	"Ansi 5 Color" : {
	"Green Component" : 0.10802463442087173,
	"Red Component" : 0.77738940715789795,
	"Blue Component" : 0.43516635894775391
	},
	"Tags" : [

	],
	"Ansi 12 Color" : {
	# reference: https://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string

	from bs4 import BeautifulSoup
	BeautifulSoup("<p>&pound;682m</p>")

	# output is like:
	# <html><body><p>£682m</p></body></html>
	# train
	import logging
	import os
	from gensim.models import word2vec

	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

	sentences = word2vec.LineSentence('/path/to/your/data')

	model = word2vec.Word2Vec(sentences,