Karl Lorey lorey

## scrape_spiegel_online.py
"""
To use this:
pip install requests
pip install --pre mlscraper

To automatically build any scraper, check out https://github.com/lorey/mlscraper
"""

import logging

## rsync-android.sh
# This command rsyncs files from an Android phone (mounted via MTP) to your Linux system.
# It took me a while to find a working combination of flags, so it is documented here.

# 1. plug in phone via USB
# 2. select image or file transfer (image will sync only images, files everything)
# 3. open android in your file system (to make sure it's mounted)
# 4. run the following command
rsync -h --progress --stats -r -tgo -p -l -D --delete "/run/user/1000/gvfs/{insert path here}/" ./{your path without trailing slash}

## selenium_xhr_requests_via_performance_logging.py
#
# This small example shows you how to access JS-based requests via Selenium
# Like this, one can access raw data for scraping,
# for example on many JS-intensive/React-based websites
#

from time import sleep

from selenium import webdriver
from selenium.webdriver import DesiredCapabilities

## block-slack-user.js
//
// This will hide all messages from a specific user in Slack. Enjoy the silence.
//

// get the owner id of a message
// -> loops back through list to find owner
function getOwnerId(i) {
  var current = i
  var sender = current.querySelector(".c-message__sender_link");
  var ownerId = sender ? sender.dataset.messageSender : null;

## delete-files-that-contain-specific-string.sh
# say we want to delete all files that contain the string "trash"
# source: https://stackoverflow.com/a/4529138

# 1) create a file that lists all files to delete
find .cache/ | xargs grep -l "trash" | awk '{print "rm "$1}' > delete.sh

# 2) check for errors and stuff
vim delete.sh

# 3) make the file executable and execute

## pandas-nested-parameters.py
def hierarchical_to_flattened_parameters(parameters_dict):
    """
    Collapse a nested parameter dict into a single-level, sklearn-style dict.

    Keys along each nested path are joined with a double underscore, e.g.
    ``{'a': {'b': 1}}`` becomes ``{'a__b': 1}``.

    :param parameters_dict: hierarchical (nested) dict of parameters
    :return: flattened dict keyed by the ``__``-joined key paths
    """
    # A single dict is normalised into a one-row frame whose columns are the
    # joined key paths.
    frame = json_normalize(parameters_dict, sep='__')
    rows = frame.to_dict(orient='records')
    # Only one record exists, and it is the flattened dict itself.
    return rows[0]

## avoiding-https-connection-pool-errors.py
# this snippet will deal with errors like HTTPSConnectionPool: Max retries exceeded with url...
# by using a backoff factor
# further reading:
# - docs: https://2.python-requests.org/en/master/user/advanced/#transport-adapters
# - stack overflow issue: https://stackoverflow.com/a/47475019
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


## basic.py
from pandas.io.json import json_normalize
df = json_normalize(data)

## markdown_to_text.py
from bs4 import BeautifulSoup
from markdown import markdown
import re

def markdown_to_text(markdown_string):
    """
    Convert a markdown string to plaintext.

    :param markdown_string: markdown source, passed straight to ``markdown()``
    :return: NOTE(review): this excerpt ends before any return statement;
        only the markdown -> HTML step is visible here. Confirm the rest
        against the full snippet.
    """

    # md -> html -> text since BeautifulSoup can extract text cleanly
    # Step 1: render the markdown source to HTML.
    html = markdown(markdown_string)

## postgres_pandas_backup.py
import os

import pandas as pd
import psycopg2
from dotenv import find_dotenv
from dotenv import load_dotenv
from psycopg2.extras import DictCursor


def main():
	"""
	To use this:
	pip install requests
	pip install --pre mlscraper

	To automatically build any scraper, check out https://github.com/lorey/mlscraper
	"""

	import logging
	# this command will rsync your files via MTP from android to your linux system
	# took me a while to find a working combination, so here's the documentation

	# 1. plug in phone via USB
	# 2. select image or file transfer (image will sync only images, files everything)
	# 3. open android in your file system (to make sure it's mounted)
	# 4. run the following command
	rsync -h --progress --stats -r -tgo -p -l -D --delete "/run/user/1000/gvfs/{insert path here}/" ./{your path without trailing slash}
	#
	# This small example shows you how to access JS-based requests via Selenium
	# Like this, one can access raw data for scraping,
	# for example on many JS-intensive/React-based websites
	#

	from time import sleep

	from selenium import webdriver
	from selenium.webdriver import DesiredCapabilities
	//
	// This will hide all messages from a specific user in Slack. Enjoy the silence.
	//

	// get the owner id of a message
	// -> loops back through list to find owner
	function getOwnerId(i) {
	var current = i
	var sender = current.querySelector(".c-message__sender_link");
	var ownerId = sender ? sender.dataset.messageSender : null;
	def hierarchical_to_flattened_parameters(parameters_dict):
	"""
	Flatten an hierarchical dict to an sklearn parameter set.
	:param parameters_dict: hierarchical dict
	:return: flattened dict
	"""
	return json_normalize(parameters_dict, sep='__').to_dict(orient='records')[0]
	# this snippet will deal with errors like HTTPSConnectionPool: Max retries exceeded with url...
	# by using a backoff factor
	# further reading:
	# - docs: https://2.python-requests.org/en/master/user/advanced/#transport-adapters
	# - stack overflow issue: https://stackoverflow.com/a/47475019
	import requests
	from requests.adapters import HTTPAdapter
	from requests.packages.urllib3.util.retry import Retry
	from pandas.io.json import json_normalize
	df = json_normalize(data)
	from bs4 import BeautifulSoup
	from markdown import markdown
	import re

	def markdown_to_text(markdown_string):
	""" Converts a markdown string to plaintext """

	# md -> html -> text since BeautifulSoup can extract text cleanly
	html = markdown(markdown_string)
	import os

	import pandas as pd
	import psycopg2
	from dotenv import find_dotenv
	from dotenv import load_dotenv
	from psycopg2.extras import DictCursor


	def main():