Roadmap:
- Dump the index from Elasticsearch using elasticdump
- Convert the JSON dump to CSV using json2csv
- (optional) Extract a single column using csvtool
- Visualize the data using VisiData
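A minimal sketch of that pipeline, assuming the index is named my_index and Elasticsearch answers on http://localhost:9200; the index, the file names, and the column number are all illustrative:

# 1. dump the index content to a JSON file
elasticdump --input=http://localhost:9200/my_index --output=my_index.json --type=data
# 2. convert the JSON dump to CSV
json2csv -i my_index.json -o my_index.csv
# 3. (optional) extract a single column, e.g. the second one
csvtool col 2 my_index.csv > column.csv
# 4. explore the result in the terminal with VisiData
vd my_index.csv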
docker run -p 3306:3306 --name my-mysql -e MYSQL_ROOT_PASSWORD=my-secret-pw -d mysql:latest
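With the container up, a quick sanity check with the standard mysql client (same port mapping and root password as in the docker run above):

mysql -h 127.0.0.1 -P 3306 -u root -p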
come
io
il suo
che
lui
era
per
su
sono
con
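These look like common Italian words, presumably the stopword list used for the chat analysis below. A minimal sketch of filtering them out of a one-token-per-line file, assuming the list is saved as stopwords.txt and the tokens as words.txt (both file names are hypothetical):

grep -vxFf stopwords.txt words.txt > words_filtered.txt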
# Example of two rows from WhatsApp exported data:
# 12/12/19, 02:09 - Alice: bla bla bla bla
# 12/12/19, 08:40 - bob: bla bla bla bla
whatsapp_user_name = "bob"  # <-- your name, as it appears in the WhatsApp export
whatsapp_datetime_format = "%m/%d/%y, %H:%M"  # <-- the American (MDY) format is used
telegram_datetime_format = "%Y-%m-%dT%H:%M:%S"  # <-- keep as-is: Telegram's export format doesn't change
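A quick way to sanity-check the MDY assumption is GNU date, which also parses US-style dates; this is purely an illustrative check, not part of the parser:

date -d "12/12/19 02:09" "+%Y-%m-%dT%H:%M:%S"   # prints 2019-12-12T02:09:00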
# Fix from Ask Ubuntu: https://askubuntu.com/questions/1162223/lenovo-18-04-no-wi-fi-adapter-found
sudo apt-get update && sudo apt-get install build-essential git dkms
git clone https://github.com/tomaspinho/rtl8821ce
cd rtl8821ce
chmod +x dkms-install.sh
chmod +x dkms-remove.sh
sudo ./dkms-install.sh
sudo modprobe 8821ce
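Two quick checks to verify the driver is actually in place (illustrative commands, not part of the original answer):

lsmod | grep 8821ce   # is the kernel module loaded?
nmcli device status   # does a wifi interface show up?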
# Init
$ git clone https://github.com/pistocop/subreddit-comments-dl.git
$ cd subreddit-comments-dl
$ pip install -r requirements.txt

# Download the AskReddit comments of the last 30 submissions
$ python src/subreddit_downloader.py AskReddit --batch-size 10 --laps 3 --reddit-id <reddit_id> --reddit-secret <reddit_secret> --reddit-username <reddit_username>

# Download the News comments posted after 1 January 2021
$ python src/subreddit_downloader.py News --batch-size 512 --laps 3 --reddit-id <reddit_id> --reddit-secret <reddit_secret> --reddit-username <reddit_username> --utc-after 1609459200
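For reference, the --utc-after value is a Unix epoch timestamp: 1609459200 is 2021-01-01 00:00:00 UTC. Values like this can be computed with GNU date:

$ date -u -d "2021-01-01 00:00:00" +%s
1609459200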
# Example log lines from the AskReddit run above:
2021-02-11 19:54:44.175 | INFO | __main__:main:241 - Start download: UTC range: [None, None], direction: `before`, batch size: 10, total submissions to fetch: 30
2021-02-11 19:54:49.769 | INFO | codetiming._timer:stop:57 - Lap 0/3 completed in 0.1m | [new/tot]: 0/0
# Build the dataset; the results will be stored under the `./dataset/` path
$ python src/dataset_builder.py
100%|██████████| 4/4 [00:00<00:00, 84.56it/s]
100%|██████████| 16/16 [00:00<00:00, 348.01it/s]
100%|██████████| 1/1 [00:00<00:00, 963.54it/s]
100%|██████████| 3/3 [00:00<00:00, 23.11it/s]
$ ls dataset/20210211210341
comments.csv  submissions.csv
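To take a first look at the output, peek at the CSV header (the folder name is the run timestamp shown above):

$ head -n 1 dataset/20210211210341/comments.csv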
# run from the garrascobike-core folder
$ python garrascobike/02_es_uploader.py --es_index my_index \
    --es_host http://localhost \
    --es_port 9200 \
    --input_file ./data/02_entities_extractions/extraction.parquet
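Once the upload finishes, a standard Elasticsearch count query tells you whether the documents landed (same host, port, and index name as above):

$ curl -s http://localhost:9200/my_index/_count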