
maneesh disodia (maneeshdisodia)

  • Data Scientist @ Ericsson R&D
  • India
@maneeshdisodia
maneeshdisodia / spikes.ipynb
Created November 7, 2023 21:50 — forked from w121211/spikes.ipynb
Identifying Spikes in timeseries data with Pandas
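The notebook itself is not rendered in this listing. As a rough illustration of the idea only, here is a minimal sketch of one common spike-detection approach (a rolling z-score threshold; the window and threshold values are assumptions, not taken from the notebook):

import numpy as np
import pandas as pd

def find_spikes(series, window=20, threshold=3.0):
    # flag points that sit more than `threshold` rolling standard
    # deviations away from the rolling mean
    mean = series.rolling(window, min_periods=1).mean()
    std = series.rolling(window, min_periods=1).std()
    z = (series - mean) / std
    return z.abs() > threshold

# toy example: a noisy signal with two injected spikes
s = pd.Series(np.random.randn(200))
s.iloc[[50, 150]] += 10.0
print(s[find_spikes(s)])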
@maneeshdisodia
maneeshdisodia / firestore_to_pandas_dataframe.py
Created November 25, 2022 12:54 — forked from romicofre/firestore_to_pandas_dataframe.py
Load firestore table to pandas dataframe
import pandas as pd
from google.cloud import firestore

# connect using application-default credentials
db = firestore.Client()

# stream every document in the 'users' collection and convert each to a dict
users = list(db.collection(u'users').stream())
users_dict = [doc.to_dict() for doc in users]

df = pd.DataFrame(users_dict)
@maneeshdisodia
maneeshdisodia / Offline-Dataflow.md
Created June 29, 2022 07:01 — forked from elavenrac/Offline-Dataflow.md
GCP Dataflow processing with no external IPs

GCP Dataflow Pipelines

This gist is a detailed walkthrough of how to deploy Python Dataflow pipelines in GCP so that they run without external IPs. Full code samples are available below.

This walkthrough assumes you have already authenticated with the gcloud login commands and have the appropriate IAM privileges to perform these operations.

Step 1 - Gather application dependencies

Since we are planning to use no external IPs on our Dataflow worker nodes, we must package up all of our application dependencies for an offline deployment. I highly recommend using a virtual environment, as your global environment will contain far more dependencies than your single application requires.

Dump your application dependencies into a single file (typically with pip freeze > requirements.txt).
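
The rest of the walkthrough and its code samples are not shown in this listing. As a hedged sketch of the end goal only, the Beam Python SDK lets you keep Dataflow workers off external IPs through pipeline options; the project, region, bucket, and subnetwork values below are placeholders:

from apache_beam import Pipeline
from apache_beam.options.pipeline_options import PipelineOptions

# --no_use_public_ips keeps the Dataflow workers on internal IPs only;
# the chosen subnetwork must then provide Private Google Access.
options = PipelineOptions([
    '--runner=DataflowRunner',
    '--project=my-project',                # placeholder
    '--region=us-central1',                # placeholder
    '--temp_location=gs://my-bucket/tmp',  # placeholder
    '--subnetwork=regions/us-central1/subnetworks/my-subnet',  # placeholder
    '--no_use_public_ips',
])

with Pipeline(options=options) as p:
    ...  # pipeline transforms go here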

@maneeshdisodia
maneeshdisodia / time-files-modified.py
Created September 28, 2021 08:29 — forked from benhosmer/time-files-modified.py
Find the oldest and newest file in a directory and sort them.
#!/usr/bin/env python
import os

# directory to inspect
path = 'data'
os.chdir(path)

# sort the directory contents by modification time, oldest first
files = sorted(os.listdir(os.getcwd()), key=os.path.getmtime)
oldest = files[0]
newest = files[-1]
@maneeshdisodia
maneeshdisodia / apply_df_by_multiprocessing.py
Created April 8, 2019 11:02 — forked from yong27/apply_df_by_multiprocessing.py
pandas DataFrame apply multiprocessing
import multiprocessing

import numpy as np
import pandas as pd

def _apply_df(args):
    # unpack the (frame, function, kwargs) tuple sent to each worker
    df, func, kwargs = args
    return df.apply(func, **kwargs)

def apply_by_multiprocessing(df, func, **kwargs):
    # split the frame into one chunk per worker and apply func in parallel
    workers = kwargs.pop('workers')
    pool = multiprocessing.Pool(processes=workers)
    result = pool.map(_apply_df,
                      [(chunk, func, kwargs) for chunk in np.array_split(df, workers)])
    pool.close()
    return pd.concat(result)
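A quick usage sketch (the square helper and the __main__ guard are illustrative additions, not part of the original gist):

def square(x):
    return x ** 2

if __name__ == '__main__':
    df = pd.DataFrame({'a': range(10), 'b': range(10)})
    print(apply_by_multiprocessing(df, square, axis=1, workers=4))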
@maneeshdisodia
maneeshdisodia / jupyterhub_aws.md
Created March 11, 2019 11:35 — forked from widdowquinn/jupyterhub_aws.md
Set up JupyterHub on AWS

JupyterHub on AWS

EC2 Setup

  • Log in to AWS
  • Go to a sensible region
  • Start a new instance with Ubuntu Trusty (14.04) - compute-optimised instances have a high vCPU:memory ratio, and the lowest-cost CPU time. c4.2xlarge is a decent choice.
  • Set the security group (firewall) to have ports 22, 80, and 443 open (SSH, HTTP, HTTPS); a boto3 sketch of these launch steps follows this list
  • If you want a static IP address (for long-running instances), associate an Elastic IP with this VM
  • If you want to use HTTPS, you'll probably need a paid certificate, or to use Amazon's Route 53 to get a non-Amazon domain (to avoid region blocking).
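The steps above are console actions. As a rough equivalent only, here is a minimal boto3 sketch; the region, AMI ID, key pair name, and security group name are placeholders rather than values from the gist:

import boto3

ec2 = boto3.client('ec2', region_name='eu-west-1')  # pick a sensible region

# open SSH, HTTP and HTTPS on a new security group
sg = ec2.create_security_group(GroupName='jupyterhub-sg',
                               Description='JupyterHub SSH and web access')
for port in (22, 80, 443):
    ec2.authorize_security_group_ingress(GroupId=sg['GroupId'],
                                         IpProtocol='tcp',
                                         FromPort=port, ToPort=port,
                                         CidrIp='0.0.0.0/0')

# launch a compute-optimised Ubuntu instance
ec2.run_instances(ImageId='ami-xxxxxxxx',  # an Ubuntu 14.04 AMI for the chosen region
                  InstanceType='c4.2xlarge',
                  KeyName='my-keypair',
                  SecurityGroupIds=[sg['GroupId']],
                  MinCount=1, MaxCount=1)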
@maneeshdisodia
maneeshdisodia / aggregation in pandas groupby
Last active January 4, 2019 10:45
aggregation in pandas groupby
import numpy as np
import pandas as pd

# create fake data (example taken from Stack Overflow)
df_example = pd.DataFrame({'CG': np.random.randint(0, 5, 100),
                           'Morph': np.random.choice(['S', 'E'], 100),
                           'R': np.random.rand(100) * -100})

def my_agg(x):
    # per-group aggregation returning several derived statistics as a Series
    x = x.sort_values('R')
    morph = x.head(1)['Morph'].values[0]
    diff = x.iloc[0]['R'] - x.iloc[1]['R']
    diff2 = -2.5 * np.log10(sum(10 ** (-0.4 * x['R'])))
    prop = (x['Morph'].iloc[1:] == 'S').mean()
    return pd.Series([morph, diff, diff2, prop],
                     index=['morph', 'diff', 'diff2', 'prop'])

# apply the custom aggregation to each CG group
result = df_example.groupby('CG').apply(my_agg)
@maneeshdisodia
maneeshdisodia / string_wrapper
Created August 13, 2018 12:13
to find the n words on either side of a word in a string
import re

def search(text, n):
    '''Searches text for the word "place" and retrieves the n words on either
    side of it, which are returned separately.'''
    word = r"\W*([\w]+)"
    groups = re.search(r'{}\W*{}{}'.format(word * n, 'place', word * n), text).groups()
    return groups[:n], groups[n:]

t = "The world is a small place, we should try to take care of it."
search(t, 3)
# (('is', 'a', 'small'), ('we', 'should', 'try'))