import pandas as pd
- from a .csv file (header=1 uses the second row as column names; drop it for the default first-row header)
df = pd.read_csv('file.csv', header=1)
- from a dictionary mapping column names to equal-length lists
df = pd.DataFrame(data_dict)
- from a list of rows, one inner list per row
df = pd.DataFrame([[y_1, x1_1, x2_1], [y_2, x1_2, x2_2]])
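A concrete, hedged version of the dictionary form (the column names here are made up):
import pandas as pd
# hypothetical columns; any mapping of name -> equal-length list works
data = {"y": [1.0, 2.0, 3.0], "x1": [0.1, 0.2, 0.3], "x2": [7, 8, 9]}
df = pd.DataFrame(data)
print(df.dtypes)  # pandas infers a dtype per column (float64, int64, ...)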
creds = "/path/to/creds.json"
df = pd.read_csv("file.csv")
# check df's dtypes and column names before writing
df.to_parquet("gs://bucket/file.parquet", storage_options=dict(token=creds))
# then create the table through the BigQuery console; it auto-detects the schema from the parquet file
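If you'd rather skip the console step, pandas can also push straight to BigQuery via the optional pandas-gbq package; a minimal sketch, where the table and project names are placeholders:
import pandas as pd
# requires `pip install pandas-gbq`; dataset.table and project id are placeholders
df = pd.read_csv("file.csv")
df.to_gbq("my_dataset.my_table", project_id="my-project", if_exists="replace")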
#!/bin/bash
# Usage:
# ./youtube_dl_best_video_audio.sh 'https://www.youtube.com/watch?v=8pp3cP_VvjY'
# -f bestvideo+bestaudio merges the best streams; -k keeps the intermediate files
youtube-dl -f bestvideo+bestaudio -k "$1"
# File path:
# ~/.jupyter/jupyter_notebook_config.py
import os
from subprocess import check_call
import datetime
import re

def timestamped_file(fname):
    """True if fname already ends in a YYYY-MM-DD date stamp."""
    return bool(re.match(r'.*\d{4}-\d{2}-\d{2}\.ipynb', fname))
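The imports above suggest this helper feeds a post-save hook; a hedged sketch of how it might be wired up, assuming the goal is to keep a dated copy of each notebook on save (the hook body is my guess, not the original):
def post_save(model, os_path, contents_manager):
    """Sketch: copy the saved notebook to name-YYYY-MM-DD.ipynb."""
    if model['type'] != 'notebook':
        return
    d, fname = os.path.split(os_path)
    if timestamped_file(fname):
        return  # already a dated copy; avoid copying copies
    stamp = datetime.datetime.now().strftime('%Y-%m-%d')
    copy_name = fname[:-len('.ipynb')] + '-' + stamp + '.ipynb'
    check_call(['cp', os_path, os.path.join(d, copy_name)])

c.FileContentsManager.post_save_hook = post_save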
ffmpeg -i "concat:file 1.mp3|file 2.mp3|file 3.mp3|..." -acodec copy ../Output-File.mp3
import json

def dump_jsonl(data, output_path, append=False):
    """
    Write list of objects to a JSON lines file.
    """
    mode = 'a+' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        for line in data:
            json_record = json.dumps(line, ensure_ascii=False)
            f.write(json_record + '\n')  # one JSON object per line
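A matching reader, sketched under the same one-object-per-line assumption:
def load_jsonl(input_path):
    """Read a JSON lines file back into a list of objects."""
    data = []
    with open(input_path, encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.rstrip('\n')))
    return data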
def get_engine():
    import sqlalchemy
    import os
    # os.environ["DB_URL"] = "postgresql://USER:PASS@HOST:PORT/DATABASE"
    engine = sqlalchemy.create_engine(os.environ["DB_URL"])
    return engine

def close_engine(engine):
    engine.dispose()
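A hedged usage sketch (assumes DB_URL is already exported; the query is a placeholder):
import pandas as pd

engine = get_engine()
df = pd.read_sql("SELECT 1 AS ok", engine)  # swap in a real query
close_engine(engine)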
from paramiko import SSHClient

SSH_HOST = ""
SSH_USER = ""
SSH_FILE_PATH = ""
COMPRESSION = "zip"

client = SSHClient()
client.load_system_host_keys()
client.connect(SSH_HOST, username=SSH_USER)
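SSH_FILE_PATH and COMPRESSION are never used in the snippet above; a hedged guess at the intended rest, compressing the remote path and pulling it down over SFTP (the remote `zip` command and the paths are assumptions):
import os

# assumption: zip is installed on the remote host and SSH_FILE_PATH exists there
archive = SSH_FILE_PATH.rstrip('/') + '.' + COMPRESSION
_, stdout, _ = client.exec_command('zip -r "%s" "%s"' % (archive, SSH_FILE_PATH))
stdout.channel.recv_exit_status()  # block until the remote zip finishes
sftp = client.open_sftp()
sftp.get(archive, os.path.basename(archive))  # download next to the script
sftp.close()
client.close()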
aws s3 cp s3://my_bucket/my_table /tmp/my_table --recursive \
  && gcloud config set account <name>@<project>.iam.gserviceaccount.com \
  && gcloud config set project <project> \
  && gcloud auth activate-service-account <name>@<project>.iam.gserviceaccount.com --key-file /my_path_to/google_credentials.json \
  && gsutil cp -r /tmp/my_table gs://bucket_name/my_table \
  && bq mk --dataset my_dataset \
  && bq mk --table --schema my_table_schema.json my_dataset.my_table \
  && bq query --nouse_legacy_sql 'delete from my_dataset.my_table where <date_range_where_clause>;' \
  && bq load --source_format=CSV --quote "" -F='\t' my_dataset.my_table gs://bucket_name/my_table/0000_part_00 \
  && bq load --source_format=CSV --quote "" -F='\t' my_dataset.my_table gs://bucket_name/my_table/0001_part_00
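# note: bq load also accepts a wildcard GCS URI (e.g. gs://bucket_name/my_table/000*_part_00),
# which could collapse the per-part load lines into one command, assuming all parts share the schema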