Roadmap:
- Dump the index from Elasticsearch using elasticdump
- Convert the JSON dump to CSV using json2csv
- (optional) Extract a single column using csvtool
- Visualize the data using VisiData
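A minimal sketch of that pipeline, assuming the index is named my_index and Elasticsearch answers on http://localhost:9200; the index, the file names, and the column number are all illustrative:

# 1. dump the index content to a JSON file
elasticdump --input=http://localhost:9200/my_index --output=my_index.json --type=data
# 2. convert the JSON dump to CSV
json2csv -i my_index.json -o my_index.csv
# 3. (optional) extract a single column, e.g. the second one
csvtool col 2 my_index.csv > column.csv
# 4. explore the result in the terminal with VisiData
vd my_index.csv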
docker run -p 3306:3306 --name my-mysql -e MYSQL_ROOT_PASSWORD=my-secret-pw -d mysql:latest
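With the container up, a quick sanity check with the standard mysql client (same port mapping and root password as in the docker run above):

mysql -h 127.0.0.1 -P 3306 -u root -p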
come
io
il suo
che
lui
era
per
su
sono
con
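These look like common Italian words, presumably the stopword list used for the chat analysis below. A minimal sketch of filtering them out of a one-token-per-line file, assuming the list is saved as stopwords.txt and the tokens as words.txt (both file names are hypothetical):

grep -vxFf stopwords.txt words.txt > words_filtered.txt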
# Example of two rows from WhatsApp exported data:
# 12/12/19, 02:09 - Alice: bla bla bla bla
# 12/12/19, 08:40 - bob: bla bla bla bla
whatsapp_user_name = "bob"  # <-- your name, as it appears in the WhatsApp export
whatsapp_datetime_format = "%m/%d/%y, %H:%M"  # <-- the American (MDY) format is used
telegram_datetime_format = "%Y-%m-%dT%H:%M:%S"  # <-- keep as-is: Telegram's export format doesn't change
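A quick way to sanity-check the MDY assumption is GNU date, which also parses US-style dates; this is purely an illustrative check, not part of the parser:

date -d "12/12/19 02:09" "+%Y-%m-%dT%H:%M:%S"   # prints 2019-12-12T02:09:00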
# Fix from Ask Ubuntu: https://askubuntu.com/questions/1162223/lenovo-18-04-no-wi-fi-adapter-found
sudo apt-get update && sudo apt-get install build-essential git dkms
git clone https://github.com/tomaspinho/rtl8821ce
cd rtl8821ce
chmod +x dkms-install.sh
chmod +x dkms-remove.sh
sudo ./dkms-install.sh
sudo modprobe 8821ce
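Two quick checks to verify the driver is actually in place (illustrative commands, not part of the original answer):

lsmod | grep 8821ce   # is the kernel module loaded?
nmcli device status   # does a wifi interface show up?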
# Init
$ git clone https://github.com/pistocop/subreddit-comments-dl.git
$ cd subreddit-comments-dl
$ pip install -r requirements.txt

# Download the AskReddit comments of the last 30 submissions
$ python src/subreddit_downloader.py AskReddit --batch-size 10 --laps 3 --reddit-id <reddit_id> --reddit-secret <reddit_secret> --reddit-username <reddit_username>

# Download the News comments posted after 1 January 2021
$ python src/subreddit_downloader.py News --batch-size 512 --laps 3 --reddit-id <reddit_id> --reddit-secret <reddit_secret> --reddit-username <reddit_username> --utc-after 1609459200
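For reference, the --utc-after value is a Unix epoch timestamp: 1609459200 is 2021-01-01 00:00:00 UTC. Values like this can be computed with GNU date:

$ date -u -d "2021-01-01 00:00:00" +%s
1609459200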
# Example log lines from the AskReddit run above:
2021-02-11 19:54:44.175 | INFO | __main__:main:241 - Start download: UTC range: [None, None], direction: `before`, batch size: 10, total submissions to fetch: 30
2021-02-11 19:54:49.769 | INFO | codetiming._timer:stop:57 - Lap 0/3 completed in 0.1m | [new/tot]: 0/0
# Build the dataset; the results will be stored under the `./dataset/` path
$ python src/dataset_builder.py
100%|██████████| 4/4 [00:00<00:00, 84.56it/s]
100%|██████████| 16/16 [00:00<00:00, 348.01it/s]
100%|██████████| 1/1 [00:00<00:00, 963.54it/s]
100%|██████████| 3/3 [00:00<00:00, 23.11it/s]
$ ls dataset/20210211210341
comments.csv  submissions.csv
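To take a first look at the output, peek at the CSV header (the folder name is the run timestamp shown above):

$ head -n 1 dataset/20210211210341/comments.csv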
# run from the garrascobike-core folder
$ python garrascobike/02_es_uploader.py --es_index my_index \
    --es_host http://localhost \
    --es_port 9200 \
    --input_file ./data/02_entities_extractions/extraction.parquet
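Once the upload finishes, a standard Elasticsearch count query tells you whether the documents landed (same host, port, and index name as above):

$ curl -s http://localhost:9200/my_index/_count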