Skip to content

Instantly share code, notes, and snippets.

#!/bin/bash
deactive_proxy () {
if [ ! "$1" = "nondestructive" ] ; then
unset -f deactive_proxy
if [ -n "$_OLD_HTTP_PROXY" ] ; then
http_proxy="$_OLD_HTTP_PROXY"
export http_proxy
unset _OLD_HTTP_PROXY
else
@idiomer
idiomer / pyspark_hdfs_utils.py
Last active June 18, 2024 08:21
Using PySpark to work with HDFS: list (ls), rename (mv), and delete (rm)
'''
The path is a directory by default
'''
def hdfs_list(path, subtract_one=True):
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
list_status = fs.listStatus(spark._jvm.org.apache.hadoop.fs.Path(path))
# file.getPath().getName(), file.getBlockSize(), file.getLen()
files_size = [file.getLen() for file in list_status]
totol_size_in_MB = sum(files_size) / 1024.0 / 1024.0
# Exit on the first failing command (-e) and trace each command as it runs (-x).
set -ex
# Method 1: iterate over a fixed range of day offsets and let `date` do the arithmetic.
# Prints every date in the closed interval [2020-01-01, 2020-01-31], one per line.
for i in {0..30}; do
# -I prints the date in ISO 8601 form (YYYY-MM-DD); -d parses "2020-01-01 +$i days" (GNU date).
thedate=$(date -I -d "2020-01-01 +$i days")
echo $thedate
done
# method 2
@idiomer
idiomer / unique_everseen.py
Created March 20, 2020 04:16
元素排重,且保持原有顺序
def unique_everseen(iterable, key=None):
"List unique elements, preserving order. Remember all elements ever seen."
# unique_everseen('AAAABBBCCDAABBB') --> A B C D
# unique_everseen('ABBCcAD', str.lower) --> A B C D
from itertools import filterfalse
seen = set()
seen_add = seen.add
if key is None:
for element in filterfalse(seen.__contains__, iterable):
@idiomer
idiomer / .gitignore
Last active April 14, 2020 08:08
gitignore for python project
# Files ending in "~" or ".swp" (typically editor backup / swap files)
*~
*.swp
# Python bytecode files and bytecode cache directories
*.pyc
*.pyo
__pycache__/
# Backup files and backup directories
*.bak
*.backup
bak/
backup/
# Temporary files
*.tmp
# Enable mouse support
set -g mouse on
# Maximum number of lines kept in the scrollback history
set-option -g history-limit 10000
# Inside tmux, press prefix + r to reload the current configuration
bind r source-file ~/.tmux.conf \; display "Reloaded!"
@idiomer
idiomer / dictUtils.py
Last active August 18, 2020 09:11
用点来访问dict里面的key
class dotdict(dict):
    """Dictionary whose keys can also be read, set and deleted as attributes.

    * ``d.key`` is resolved with ``dict.get``, so a missing key evaluates to
      ``None`` instead of raising ``AttributeError``.
    * ``d.key = value`` stores ``d["key"] = value``.
    * ``del d.key`` removes ``d["key"]`` and raises ``KeyError`` if the key
      is absent (the behaviour of ``dict.__delitem__``).
    """

    # __getattr__ is only consulted after normal attribute lookup fails, so
    # the regular dict methods (keys, items, get, ...) stay reachable on read.
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__
class TrieTree:
""" 字典树 """
class Node:
@idiomer
idiomer / multiple_read.py
Last active July 17, 2020 03:10
多进程读取多个数据文件
from glob import glob
import multiprocessing
from tqdm import tqdm
import pandas as pd
def json_reader(fname):
    """Load one JSON Lines file into a pandas DataFrame.

    Args:
        fname: Path of the file to read; passed straight to ``pd.read_json``.
            Every line of the file is parsed as a separate JSON record
            (``lines=True``).

    Returns:
        The DataFrame produced by ``pd.read_json``.
    """
    return pd.read_json(fname, lines=True)
@idiomer
idiomer / dateUtils.py
Last active January 26, 2021 11:41
python date utils: yesterday, dateAdd, dateSub, dateRange
def get_yesterday(n_days_ago=1, FMT='%Y-%m-%d'):
    """Return the local date ``n_days_ago`` days before now as a string.

    Args:
        n_days_ago: Number of days to subtract from the current local time
            (default 1, i.e. yesterday). It is passed to ``timedelta`` as
            the number of days.
        FMT: ``strftime`` format used to render the result.

    Returns:
        The shifted timestamp formatted with ``FMT``.
    """
    # Imported locally so the helper stays self-contained.
    import datetime

    target = datetime.datetime.now() - datetime.timedelta(days=n_days_ago)
    return target.strftime(FMT)
def dateAdd(thedate, num, FMT='%Y-%m-%d'):
import datetime
strptime, strftime = datetime.datetime.strptime, datetime.datetime.strftime
@idiomer
idiomer / facets_overview_display.py
Last active August 7, 2020 06:48
用facets_overview查看dataset的概览
import base64
from IPython.core.display import display, HTML
# pip install facets_overview
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator
def display_overview(df_train, df_test=None):
gfsg = GenericFeatureStatisticsGenerator()
if df_test is not None:
proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': df_train},