Skip to content

Instantly share code, notes, and snippets.

View josepablog's full-sized avatar

José P. González-Brenes josepablog

View GitHub Profile
# -*- coding: utf-8 -*-
"""
common-crawl-cdx.py
A simple example program to analyze the Common Crawl index.
This is implemented as a single stream job which accesses S3 via HTTP,
so that it can easily be run from any laptop, but it could easily be
converted to an EMR job which processes the 300 index files in parallel.
@TomAugspurger
TomAugspurger / to_redshift.py
Last active September 16, 2021 16:55
to_redshift.py
# see also https://github.com/wrobstory/pgshift
import gzip
from io import StringIO, BytesIO
from functools import wraps
import boto
from sqlalchemy import MetaData
from pandas import DataFrame
from pandas.io.sql import SQLTable, pandasSQL_builder
@mblondel
mblondel / kmeans.py
Last active April 21, 2024 13:41
Fuzzy K-means and K-medians
# Copyright Mathieu Blondel December 2011
# License: BSD 3 clause
import numpy as np
import pylab as pl
from sklearn.base import BaseEstimator
from sklearn.utils import check_random_state
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans as KMeansGood