Last active
February 3, 2016 07:51
-
-
Save hongkongkiwi/a0115757ea53aefd2a93 to your computer and use it in GitHub Desktop.
This is a simple script to import many (millions!) entries from a blacklist file into Redis. It first generates a file in the Redis mass-insertion (protocol) format, then pipes that file into Redis for very fast importing.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Loads category descriptions into a Redis hash.
#
# Each line of $FILE is split at its first '-': the text before it becomes the
# hash field, the text after it becomes the value.

FILE="CATEGORIES"
REDIS_KEY="categories"

#######################################
# Store every "<category>-<description>" line of a file in a Redis hash.
# Arguments:
#   $1 - path of the file to read
#   $2 - name of the Redis hash to populate
# Returns:
#   Non-zero if the file is not readable.
#######################################
import_categories() {
  local file=$1
  local key=$2
  local line category description

  if [[ ! -r "$file" ]]; then
    printf 'Cannot read category file: %s\n' "$file" >&2
    return 1
  fi

  # -r keeps backslashes literal; the "|| [[ -n ... ]]" clause picks up a
  # final line that has no trailing newline.
  while read -r line || [[ -n "$line" ]]; do
    [[ -n "$line" ]] || continue # nothing to store for a blank line
    category=${line%%-*}
    # Everything after the FIRST '-', so descriptions may contain hyphens.
    description=${line#*-}
    # stdin is redirected so the client cannot consume the loop's input.
    redis-cli hset "$key" "$category" "$description" < /dev/null
  done < "$file"
}

import_categories "$FILE" "$REDIS_KEY"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Converts per-category "domains" files into Redis mass-insert files and then
# pipes those files into Redis.

REDIS="redis-cli"      # Redis client executable
OUTPUT_DIR="redis_out" # directory holding the generated *.redis files
BASE_DIR="."           # root below which <category>/domains files are searched
success_count=0

# Number of category files: every regular file named "domains" found at most
# two levels below BASE_DIR.
category_count=$(find "$BASE_DIR" -maxdepth 2 -name "domains" -type f -print | wc -l)
# Some wc implementations pad the count with blanks; drop the padding.
category_count=${category_count//[[:space:]]/}
#######################################
# Convert one category's domain list into a Redis mass-insert file.
#
# The category name is the name of the directory that contains the input
# file. For every non-blank line two SADD commands are written in the Redis
# protocol: one adds the category to "domain:<domain>:categories", the other
# adds the domain to the "domains" set.
#
# Arguments:
#   $1 - path of a domains file (one domain per line)
#   $2 - directory in which "<category>.redis" is (re)created
# Outputs:
#   A one-line summary on stdout.
# Returns:
#   Non-zero if the output directory cannot be created or the input/output
#   file cannot be opened.
#######################################
process_file() {
  local input_file=$1
  local output_dir=$2
  local category output_file domain key
  local counter=0
  local start=$SECONDS
  # Bulk-string headers carry BYTE lengths; in the C locale ${#var} counts
  # bytes rather than characters, which matters for non-ASCII domains.
  local LC_ALL=C

  category=$(dirname -- "$input_file") || return
  category=$(basename -- "$category") || return
  output_file="$output_dir/${category}.redis"

  mkdir -p -- "$output_dir" || return

  # The output file is opened (and truncated) once for the whole loop.
  while IFS= read -r domain || [[ -n "$domain" ]]; do
    # Strip leading and trailing whitespace, including a CR from CRLF files.
    domain="${domain#"${domain%%[![:space:]]*}"}"
    domain="${domain%"${domain##*[![:space:]]}"}"
    [[ -n "$domain" ]] || continue # skip blank lines

    key="domain:${domain}:categories"
    # Data is passed as printf arguments, never as part of the format, so a
    # '%' or '\' in a line cannot corrupt the output.
    printf '*3\r\n$4\r\nSADD\r\n$%d\r\n%s\r\n$%d\r\n%s\r\n' \
      "${#key}" "$key" "${#category}" "$category"
    printf '*3\r\n$4\r\nSADD\r\n$7\r\ndomains\r\n$%d\r\n%s\r\n' \
      "${#domain}" "$domain"
    counter=$((counter + 1))
  done < "$input_file" > "$output_file" || return

  echo "-> Processed $counter entries from $category in $((SECONDS - start)) secs"
}
echo "Found $category_count categories to import"

# GNU parallel runs each job in a new shell, so the worker function has to be
# exported to be visible there.
export -f process_file
export success_count

# NOTE(review): only the first 10 "domains" files are converted, although the
# message above reports every file found. The limit is kept as the default
# and can be raised through IMPORT_MAX_FILES; confirm whether it is intended.
max_files=${IMPORT_MAX_FILES:-10}
files=$(find "$BASE_DIR" -maxdepth 2 -name "domains" -type f | head -n "$max_files")

if [[ -n "$files" ]]; then
  mkdir -p -- "$OUTPUT_DIR" || exit 1
  # parallel exits non-zero when any job failed.
  if ! printf '%s\n' "$files" |
    parallel -j 3 --no-notice process_file '{}' "$OUTPUT_DIR"; then
    echo "Generating the mass-insert files failed." >&2
    exit 1
  fi
  echo "Successfully generated all entries."
else
  # Without this guard an empty list would still start one job with an
  # empty file name.
  echo "No domains files found under $BASE_DIR; nothing to generate." >&2
fi

echo "Finding all output files and importing into redis"
import_failed=0
# File names are read NUL-delimited and handed to the client through a
# redirection, so they are never re-parsed by a shell.
while IFS= read -r -d '' redis_file; do
  if ! "${REDIS:-redis-cli}" --pipe < "$redis_file"; then
    echo "Import of $redis_file failed." >&2
    import_failed=1
  fi
done < <(find "$OUTPUT_DIR" -maxdepth 1 -name "*.redis" -print0)

if ((import_failed)); then
  exit 1
fi
echo "Completed all imports!"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment