Skip to content

Instantly share code, notes, and snippets.

@odinokov
odinokov / download_file.py
Last active March 15, 2023 04:38
Downloads a file from the given URL and saves it to the specified file path.
import os
import requests
import shutil
from tqdm import tqdm
def download_file(url: str, file_path: str) -> None:
"""
Downloads a file from the given URL and saves it to the specified file path.
Shows a progress bar using the tqdm library.
@odinokov
odinokov / print_columns.py
Created March 14, 2023 10:16
Prints the elements of a list in columns.
def print_columns(data: list, num_cols : int):
"""
Prints the elements of a list in columns.
Args:
data: A list of data to be printed.
num_cols: The number of columns to print.
Returns:
None
@odinokov
odinokov / covar.py
Last active June 22, 2022 08:23
covar
# remove highly correlated features in order to reduce the number of features
import numpy as np
def drop_covariance(df, threshold=0.95):
# Create correlation matrix
corr_matrix = df.corr().abs()
@odinokov
odinokov / simhash.py
Last active May 24, 2022 05:56
How simhash works
# read more about simhash:
# http://matpalm.com/resemblance/simhash/
#
from collections import Counter
from itertools import chain
from toolz import pipe
import numpy as np
global HASH_SIZE
@odinokov
odinokov / rocksdb.py
Last active May 23, 2022 12:48
how to use RocksDB
"""
http://rocksdb.org
sudo apt-get install librocksdb-dev
pip3 install Cython python-rocksdb
"""
import rocksdb, gc
import numpy as np
rocksdb_fm_path = 'test.db'
@odinokov
odinokov / r_setup.rmd
Last active May 10, 2022 04:56
an example of rmd chunk
```{r setup, include=TRUE}
# clean up R environment
rm(list = ls(all = TRUE))
# permanently setting the CRAN repository
options(repos = getOption("repos")["CRAN"])
if (!requireNamespace("pacman", quietly = TRUE))
install.packages("pacman")
@odinokov
odinokov / gsheets.py
Last active May 23, 2022 12:49
how to get access to Google Sheets from Python
# pip3 install pygsheets --user
import pygsheets
import sys
# Get user's authorization as a Service Account
# Turn on Google Drive and Google Sheets API
SPREADSHEET_NAME = 'ABCD'
key = './pygsheets-348111-b94d5f8fa9f2.json'
gc = pygsheets.authorize(service_file=key)
# do this prior to running the script:
@odinokov
odinokov / modin.py
Last active April 28, 2022 02:50
modin
import modin.pandas as pd
from modin.config import Engine
# Engine.put("ray") # Modin will use Ray
Engine.put("dask") # Modin will use Dask
@odinokov
odinokov / bed2bins.py
Last active April 13, 2022 05:59
Binning regions of bed file
%%writefile bed2bins.py
# instead of https://bedtools.readthedocs.io/en/latest/content/tools/makewindows.html
# cat file_name.bam | bedtools makewindows -b - -n 5 -i winnum
#
# How to use:
# cat file_name.bam | python3 bed2bins.py 5
#
# expects input on stdin from a bam file with 6 fields, i.e.,
#
@odinokov
odinokov / padding_to_the_longest.py
Last active October 7, 2022 03:38
load tsv with float values of various length into memory
# load a tsv whose rows hold float values of varying length as a numpy array
import numpy as np
from itertools import zip_longest
file_name = # specify
with open(file_name) as file:
lines = [np.asarray(line.rstrip().split('\t')).astype(np.float16) for line in file]