Marco Bonzanini bonzanini

## twitter_most_common_words.py
# Print most common words in a corpus collected from Twitter
#
# Full description:
# http://marcobonzanini.com/2015/03/02/mining-twitter-data-with-python-part-1/
# http://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/
# http://marcobonzanini.com/2015/03/17/mining-twitter-data-with-python-part-3-term-frequencies/
#
# Run:
# python twitter_most_common_words.py <filename.jsonl>

## create_recent_articles.sh
curl -XDELETE http://localhost:9200/blog

curl -XPUT http://localhost:9200/blog -d '{
    "settings" : {
        "index" : {
            "number_of_shards" : 1,
            "number_of_replicas" : 0
        }
    },
    "mapping": {

## stem_lemma_pos_nltk_example.py
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Instantiate the Porter stemmer and the WordNet lemmatiser.
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

# Print the Porter stem of three inflected forms, one per line.
for form in ("going", "gone", "goes"):
    print("Stem %s: %s" % (form, stemmer.stem(form)))

## sentiment_classification.py
# You need to install scikit-learn:
# sudo pip install scikit-learn
#
# Dataset: Polarity dataset v2.0
# http://www.cs.cornell.edu/people/pabo/movie-review-data/
#
# Full discussion:
# https://marcobonzanini.wordpress.com/2015/01/19/sentiment-analysis-with-python-and-scikit-learn


## run_luigi.py
# run with a custom --n
# python run_luigi.py SquaredNumbers --local-scheduler --n 20

import luigi

class PrintNumbers(luigi.Task):
    """Luigi task parameterised by an integer ``n``.

    NOTE(review): only ``requires`` is visible in this excerpt; the rest of
    the task is presumably defined in the full file -- confirm there.
    """

    # Integer task parameter; falls back to 10 when no value is supplied.
    n = luigi.IntParameter(default=10)

    def requires(self):
        """Return an empty list, i.e. no required tasks are declared."""
        return []

## create_proximity.sh
# Send a DELETE for /test/articles to the server on localhost:9200.
curl -XDELETE http://localhost:9200/test/articles

# PUT a mapping for "articles" in the "test" index: the "content" field is
# declared as type "string" with position_offset_gap set to 100.
curl -XPUT http://localhost:9200/test/_mapping/articles -d '{
    "properties": {
        "content": {
            "type":                "string",
            "position_offset_gap": 100
        }
    }
}'

## create_index.sh
# POST three sample documents (ids 1, 2 and 3) to /test/articles on
# localhost:9200; each JSON body carries a single "content" field.
curl -XPOST http://localhost:9200/test/articles/1 -d '{
    "content": "The quick brown fox"
}'
curl -XPOST http://localhost:9200/test/articles/2 -d '{
    "content": "What does the fox say?"
}'
curl -XPOST http://localhost:9200/test/articles/3 -d '{
    "content": "The quick brown fox jumped over the lazy dog"
}'
curl -XPOST http://localhost:9200/test/articles/4 -d '{

## search_biopython.py
# This code uses Biopython to retrieve lists of articles from PubMed.
# You need to install Biopython first.

# If you use Anaconda:
# conda install biopython

# If you use pip/venv:
# pip install biopython

# Full discussion:

## config.py
# Placeholder credentials -- replace every value with your own before use.
# Consumer key/secret pair.
consumer_key, consumer_secret = 'your-consumer-key', 'your-consumer-secret'
# Access token/secret pair.
access_token, access_secret = 'your-access-token', 'your-access-secret'
	# Print most common words in a corpus collected from Twitter
	#
	# Full description:
	# http://marcobonzanini.com/2015/03/02/mining-twitter-data-with-python-part-1/
	# http://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/
	# http://marcobonzanini.com/2015/03/17/mining-twitter-data-with-python-part-3-term-frequencies/
	#
	# Run:
	# python twitter_most_common_words.py <filename.jsonl>
	curl -XDELETE http://localhost:9200/blog

	curl -XPUT http://localhost:9200/blog -d '{
	"settings" : {
	"index" : {
	"number_of_shards" : 1,
	"number_of_replicas" : 0
	}
	},
	"mapping": {
	from nltk import pos_tag
	from nltk.tokenize import word_tokenize
	from nltk.stem import PorterStemmer, WordNetLemmatizer

	# Instantiate the Porter stemmer and the WordNet lemmatiser.
	stemmer = PorterStemmer()
	lemmatiser = WordNetLemmatizer()

	# Print the Porter stem of three inflected forms, one per line.
	for form in ("going", "gone", "goes"):
	    print("Stem %s: %s" % (form, stemmer.stem(form)))
	# You need to install scikit-learn:
	# sudo pip install scikit-learn
	#
	# Dataset: Polarity dataset v2.0
	# http://www.cs.cornell.edu/people/pabo/movie-review-data/
	#
	# Full discussion:
	# https://marcobonzanini.wordpress.com/2015/01/19/sentiment-analysis-with-python-and-scikit-learn
	# run with a custom --n
	# python run_luigi.py SquaredNumbers --local-scheduler --n 20

	import luigi

	class PrintNumbers(luigi.Task):
	    """Luigi task parameterised by an integer ``n``.

	    NOTE(review): only ``requires`` is visible in this excerpt; the rest
	    of the task is presumably defined in the full file -- confirm there.
	    """

	    # Integer task parameter; falls back to 10 when no value is supplied.
	    n = luigi.IntParameter(default=10)

	    def requires(self):
	        """Return an empty list, i.e. no required tasks are declared."""
	        return []
	# Send a DELETE for /test/articles to the server on localhost:9200.
	curl -XDELETE http://localhost:9200/test/articles

	# PUT a mapping for "articles" in the "test" index: the "content" field is
	# declared as type "string" with position_offset_gap set to 100.
	curl -XPUT http://localhost:9200/test/_mapping/articles -d '{
	"properties": {
	"content": {
	"type": "string",
	"position_offset_gap": 100
	}
	}
	}'
	# POST three sample documents (ids 1, 2 and 3) to /test/articles on
	# localhost:9200; each JSON body carries a single "content" field.
	curl -XPOST http://localhost:9200/test/articles/1 -d '{
	"content": "The quick brown fox"
	}'
	curl -XPOST http://localhost:9200/test/articles/2 -d '{
	"content": "What does the fox say?"
	}'
	curl -XPOST http://localhost:9200/test/articles/3 -d '{
	"content": "The quick brown fox jumped over the lazy dog"
	}'
	curl -XPOST http://localhost:9200/test/articles/4 -d '{
	# This code uses Biopython to retrieve lists of articles from PubMed.
	# You need to install Biopython first.

	# If you use Anaconda:
	# conda install biopython

	# If you use pip/venv:
	# pip install biopython

	# Full discussion:
	# Placeholder credentials -- replace every value with your own before use.
	# Consumer key/secret pair.
	consumer_key, consumer_secret = 'your-consumer-key', 'your-consumer-secret'
	# Access token/secret pair.
	access_token, access_secret = 'your-access-token', 'your-access-secret'