The top n questions data scientists ask
Data science doesn’t start with data; it starts with a problem…
The pipeline model is useful, but data scientists progress via a series of questions — what are those questions?
from elasticsearch.helpers import bulk | |
from elasticsearch import Elasticsearch | |
class ElasticIndexer(object): | |
""" | |
Create an ElasticSearch instance, and given a list of documents, | |
index the documents into ElasticSearch. | |
""" | |
def __init__(self): | |
self.elastic_search = Elasticsearch() |
import os | |
from sklearn.datasets.base import Bunch | |
from yellowbrick.download import download_all | |
## The path to the test data sets | |
FIXTURES = os.path.join(os.getcwd(), "data") | |
## Dataset loading mechanisms | |
datasets = { |
package main | |
import ( | |
"fmt" | |
"log" | |
"github.com/shirou/gopsutil/mem" | |
"github.com/shirou/gopsutil/cpu" | |
"github.com/shirou/gopsutil/disk" | |
"github.com/shirou/gopsutil/host" |
import os | |
import zipfile | |
import requests | |
import pandas as pd | |
WALKING_DATASET = ( | |
"https://archive.ics.uci.edu/ml/machine-learning-databases/00286/User%20Identification%20From%20Walking%20Activity.zip", | |
) | |
def download_data(path='data', urls=WALKING_DATASET): |
# kimchi.py | |
# For converting Python 2 pickles to Python 3 | |
import os | |
import dill | |
import pickle | |
import argparse | |
def convert(old_pkl): |
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
# plot_classifier_comparison.py | |
""" | |
A comparison of several classifiers in scikit-learn on synthetic datasets. | |
The point of this example is to illustrate the nature of decision boundaries | |
of different classifiers. | |
Particularly in high-dimensional spaces, data can more easily be separated | |
linearly and the simplicity of classifiers such as naive Bayes and linear SVMs |