Alexander Galea (agalea91)
import pandas as pd

creds = "/path/to/creds.json"
df = pd.read_csv("file.csv")
# Check the dtypes and column names before writing
df.to_parquet("gs://bucket/file.parquet", storage_options=dict(token=creds))
# Then load into BigQuery through the console; the schema is auto-detected
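
The console step can also be scripted; a minimal sketch with the bq CLI, assuming hypothetical dataset and table names, with --autodetect standing in for the console's schema auto-detection:

bq load --source_format=PARQUET --autodetect my_dataset.my_table gs://bucket/file.parquet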
agalea91 / pandas-commands.md
Last active June 17, 2022 07:26
Useful commands for the pandas DataFrame library for Python.

Useful commands for Pandas dataframes

import pandas as pd


Loading data

  • from .csv
    df = pd.read_csv('file.csv', header=1)
  • from dictionary
    df = pd.DataFrame(data_dict)
  • from lists (see the sketch below)
    df = pd.DataFrame([[y_1, x1_1, x2_1, ...], [y_2, x1_2, x2_2, ...], ...])
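
A minimal runnable sketch of the dictionary and list constructors above (column names y and x1 are hypothetical):

import pandas as pd

# From a dictionary: keys become column names
df = pd.DataFrame({'y': [1, 0], 'x1': [0.5, 1.2]})

# From lists of rows: pass column names explicitly
df = pd.DataFrame([[1, 0.5], [0, 1.2]], columns=['y', 'x1'])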
agalea91 / youtube_dl_best_video_audio.sh
Created May 29, 2022 16:21
youtube-dl download best quality video & audio
#!/bin/bash
# Usage:
# ./youtube_dl_best_video_audio.sh 'https://www.youtube.com/watch?v=8pp3cP_VvjY'
# -f bestvideo+bestaudio fetches the best streams; -k keeps the intermediate files after merging
youtube-dl -f bestvideo+bestaudio -k "$1"
agalea91 / jupyter_notebook_config.py
Last active October 13, 2021 00:28
Jupyter Config file with post-save hook for saving Notebook as Python script
# File path:
# ~/.jupyter/jupyter_notebook_config.py
import os
import re
import datetime
from subprocess import check_call

def timestamped_file(fname):
    # Match notebooks whose names end in a YYYY-MM-DD date stamp
    return bool(re.match(r'.*\d{4}-\d{2}-\d{2}\.ipynb', fname))
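
The hook itself is cut off in this listing; a sketch of the usual nbconvert post-save hook, assuming the timestamped-file check is meant to skip dated snapshot copies:

def script_post_save(model, os_path, contents_manager, **kwargs):
    """Convert the notebook to a .py script after each save."""
    if model['type'] != 'notebook':
        return
    d, fname = os.path.split(os_path)
    if timestamped_file(fname):
        return  # assumption: dated snapshot copies are not exported
    check_call(['jupyter', 'nbconvert', '--to', 'script', fname], cwd=d)

c.FileContentsManager.post_save_hook = script_post_save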
agalea91 / ffmpeg_mp3_join.sh
Created August 13, 2021 20:21
ffmpeg audio file join (concatenate)
ffmpeg -i "concat:file 1.mp3|file 2.mp3|file 3.mp3|..." -acodec copy ../Output-File.mp3
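
For many inputs, the concat demuxer avoids building a long protocol string; a sketch with a hypothetical list.txt:

# list.txt contains one line per input, e.g. file 'file 1.mp3'
ffmpeg -f concat -safe 0 -i list.txt -acodec copy ../Output-File.mp3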
agalea91 / jsonl_io.py
Last active August 5, 2021 06:07
Dump / load JSON line data.
import json

def dump_jsonl(data, output_path, append=False):
    """
    Write list of objects to a JSON lines file.
    """
    mode = 'a+' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        for line in data:
            json_record = json.dumps(line, ensure_ascii=False)
            f.write(json_record + '\n')
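
The gist title promises loading as well, but the listing truncates; this counterpart is a sketch:

def load_jsonl(input_path):
    """Read a JSON lines file into a list of objects."""
    with open(input_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]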

Unload to s3

unload ('select * from schema.table')
to 's3://path/to/s3/folder/'
-- authenticate with either an IAM role or access keys:
-- iam_role 'arn:aws:iam::<aws-account-id>:role/<role-name>'
-- credentials 'aws_access_key_id=<key>;aws_secret_access_key=<secret>'
allowoverwrite
format as csv;
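
A sketch of the inverse load with COPY, reusing the same placeholder authorization:

copy schema.table
from 's3://path/to/s3/folder/'
iam_role 'arn:aws:iam::<aws-account-id>:role/<role-name>'
format as csv;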
agalea91 / sqlalchemy_execute_raw_sql.py
Last active February 19, 2021 15:55
Run raw SQL and read/write from SQL database with Pandas using Sqlalchemy
import os
import sqlalchemy

def get_engine():
    # os.environ["DB_URL"] = "postgresql://USER:PASS@HOST:PORT/DATABASE"
    engine = sqlalchemy.create_engine(os.environ["DB_URL"])
    return engine

def close_engine(engine):
    engine.dispose()
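
A minimal usage sketch for the read/write half of the description (my_table is a placeholder):

import pandas as pd

engine = get_engine()
df = pd.read_sql("select * from my_table", engine)  # raw SQL into a DataFrame
df.to_sql("my_table", engine, if_exists="append", index=False)
close_engine(engine)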
agalea91 / ssh_pd_read_csv.py
Last active January 5, 2021 00:17
Read CSV file on remote server over SSH, using local system RSA auth
from paramiko import SSHClient

SSH_HOST = ""
SSH_USER = ""
SSH_FILE_PATH = ""
COMPRESSION = "zip"

client = SSHClient()
client.load_system_host_keys()
client.connect(SSH_HOST, username=SSH_USER)
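
The listing stops before the read itself; a sketch of the remaining step, assuming the remote file is a zipped CSV:

import pandas as pd

sftp = client.open_sftp()
with sftp.open(SSH_FILE_PATH, 'rb') as f:
    # pandas accepts a file-like object; compression must match the remote file
    df = pd.read_csv(f, compression=COMPRESSION)
client.close()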
agalea91 / s3_gs_bigquery_upload.sh
Last active October 28, 2020 05:36
Upload from s3 to gs and then to bigquery
aws s3 cp s3://my_bucket/my_table /tmp/my_table --recursive \
&& gcloud config set account <name>@<project>.iam.gserviceaccount.com \
&& gcloud config set project <project> \
&& gcloud auth activate-service-account <name>@<project>.iam.gserviceaccount.com --key-file /my_path_to/google_credentials.json \
&& gsutil cp -r /tmp/my_table gs://bucket_name/my_table \
&& bq mk --dataset my_dataset \
&& bq mk --table --schema my_table_schema.json my_dataset.my_table \
&& bq query --nouse_legacy_sql 'delete from my_dataset.my_table where <date_range_where_clause>;' \
&& bq load --source_format=CSV --quote "" -F='\t' my_dataset.my_table gs://bucket_name/my_table/0000_part_00 \
&& bq load --source_format=CSV --quote "" -F='\t' my_dataset.my_table gs://bucket_name/my_table/0001_part_00