boxabirds/Dockerfile

## Dockerfile
# Builds a data-science / NLP image on top of the TensorFlow base image.

# Tag of the tensorflow/tensorflow base image. Defaults to `latest` (the
# historical behaviour of this file); override it to get a reproducible build:
#   docker build --build-arg TENSORFLOW_TAG=<tag> -t julianharris/data-science-stack .
# NOTE(review): `latest` moves over time and some of the libraries below
# (pybrain, sframe) may not install on current images -- pin a tag you have
# verified.
ARG TENSORFLOW_TAG=latest
FROM tensorflow/tensorflow:${TENSORFLOW_TAG}

# Python libraries, installed in one layer. --no-cache-dir keeps pip's download
# cache out of the image.
#   gensim  - unsupervised topic modelling using LDA (latent dirichlet
#             allocation), adding semantics to text analysis
#   nltk    - natural language processing toolkit
#   pandas  - general (statistical) data analysis: like R for python
#   pybrain - machine learning library that includes reinforcement learning,
#             the method used by DeepMind's AlphaGo
#   seaborn - statistical visualisations
#   sframe  - open source SFrame tabular data structure
#   spacy   - NLP library that tags up a sentence with word-to-word
#             dependencies, e.g. which words in "he was going to school"
#             relate to each other (original author's note: 93% tagging
#             accuracy; Google's Parsey McParseface has 94%)
RUN pip install --no-cache-dir \
        gensim \
        nltk \
        pandas \
        pybrain \
        seaborn \
        sframe \
        spacy

# Utility data sets for training models, fetched with a single downloader call.
#   brown     - Brown corpus: standard English text (news articles, research
#               and other genres) for training models
#   gutenberg - test training set of public domain books, including the bible
#   punkt     - Punkt sentence tokenizer models (splits text into sentences;
#               it does not strip punctuation)
#   stopwords - stopword lists in a variety of languages
#   wordnet   - WordNet lexical database: groups of related words (synonym
#               sets), used e.g. for lemmatisation
RUN python -m nltk.downloader \
        brown \
        gutenberg \
        punkt \
        stopwords \
        wordnet

# Relative path: resolves against the base image's working directory, so this
# leaves the working directory where the base image put it.
# NOTE(review): the run command below mounts under /notebooks, so the base image
# presumably works from /notebooks -- confirm before switching to an absolute path.
WORKDIR "./"

# then run this:
# docker run -it -p 8888:8888 -v ~/data-science-docker/workbook/:/notebooks/workbook/ julianharris/data-science-stack:latest
	# Builds a data-science / NLP image on top of the TensorFlow base image.

	# Tag of the tensorflow/tensorflow base image. Defaults to `latest` (the
	# historical behaviour of this file); override it to get a reproducible build:
	#   docker build --build-arg TENSORFLOW_TAG=<tag> -t julianharris/data-science-stack .
	# NOTE(review): `latest` moves over time and some of the libraries below
	# (pybrain, sframe) may not install on current images -- pin a tag you have
	# verified.
	ARG TENSORFLOW_TAG=latest
	FROM tensorflow/tensorflow:${TENSORFLOW_TAG}

	# Python libraries, installed in one layer. --no-cache-dir keeps pip's download
	# cache out of the image.
	#   gensim  - unsupervised topic modelling using LDA (latent dirichlet
	#             allocation), adding semantics to text analysis
	#   nltk    - natural language processing toolkit
	#   pandas  - general (statistical) data analysis: like R for python
	#   pybrain - machine learning library that includes reinforcement learning,
	#             the method used by DeepMind's AlphaGo
	#   seaborn - statistical visualisations
	#   sframe  - open source SFrame tabular data structure
	#   spacy   - NLP library that tags up a sentence with word-to-word
	#             dependencies, e.g. which words in "he was going to school"
	#             relate to each other (original author's note: 93% tagging
	#             accuracy; Google's Parsey McParseface has 94%)
	RUN pip install --no-cache-dir \
	        gensim \
	        nltk \
	        pandas \
	        pybrain \
	        seaborn \
	        sframe \
	        spacy

	# Utility data sets for training models, fetched with a single downloader call.
	#   brown     - Brown corpus: standard English text (news articles, research
	#               and other genres) for training models
	#   gutenberg - test training set of public domain books, including the bible
	#   punkt     - Punkt sentence tokenizer models (splits text into sentences;
	#               it does not strip punctuation)
	#   stopwords - stopword lists in a variety of languages
	#   wordnet   - WordNet lexical database: groups of related words (synonym
	#               sets), used e.g. for lemmatisation
	RUN python -m nltk.downloader \
	        brown \
	        gutenberg \
	        punkt \
	        stopwords \
	        wordnet

	# Relative path: resolves against the base image's working directory, so this
	# leaves the working directory where the base image put it.
	# NOTE(review): the run command below mounts under /notebooks, so the base image
	# presumably works from /notebooks -- confirm before switching to an absolute path.
	WORKDIR "./"

	# then run this:
	# docker run -it -p 8888:8888 -v ~/data-science-docker/workbook/:/notebooks/workbook/ julianharris/data-science-stack:latest