This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa). Mostly copied from Hugging Face's run_glue."""
from __future__ import absolute_import, division, print_function | |
import argparse | |
import glob | |
import logging | |
import os | |
import random |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa). Mostly copied from Hugging Face's run_glue."""
from __future__ import absolute_import, division, print_function | |
import argparse | |
import glob | |
import logging | |
import os | |
import random |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Download every video listed in a URL batch file and archive the result to S3.
#
# Usage: <script> FILE SRC DST
#   FILE  name of the batch file stored under s3://aol-on-sandbox/SRC/
#   SRC   S3 prefix the batch file is copied from
#   DST   S3 prefix that FILE.tar.gz is moved to
if [ "$#" -ne 3 ]; then
    echo "usage: $0 FILE SRC DST" >&2
    exit 1
fi
file=$1
src=$2
dst=$3
workdir=/home/ubuntu

# The batch file is downloaded into $workdir and the archive is uploaded from
# there, so run everything from that directory instead of relying on the
# caller's current directory.
cd "$workdir" || exit 1
mkdir -p extract   # -p: do not fail when a previous run left the directory behind

echo processing "$file"
sudo aws s3 cp "s3://aol-on-sandbox/$src/$file" "$workdir/$file" || exit 1

# -i makes youtube-dl continue past individual download errors, so a non-zero
# exit status here is deliberately not treated as fatal.
youtube-dl -i -f 'bestvideo[height=480]/best' -o 'extract/%(id)s.%(ext)s' --batch-file="$workdir/$file"

tar -zcvf "$workdir/$file.tar.gz" extract || exit 1
sudo aws s3 mv "$workdir/$file.tar.gz" "s3://aol-on-sandbox/$dst/$file.tar.gz"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Fetch an archive from S3, unpack it into ./extract and run html2text.py on
# the unpacked directory, with /home/ubuntu/text as its second argument.
#
# Usage: <script> FILE SRC DST
#   FILE  archive name under s3://aol-on-sandbox/SRC/
#   SRC   S3 prefix the archive is downloaded from
#   DST   stored in $dst; not used by the commands in this section
mkdir -p extract text   # -p: tolerate directories left over from an earlier run
file=$1
src=$2
dst=$3
echo processing "$file"
# Stop early when the download or the extraction fails; the later steps would
# otherwise run against missing or stale data.
aws s3 cp "s3://aol-on-sandbox/$src/$file" "/home/ubuntu/$file" || exit 1
tar -xzf "/home/ubuntu/$file" -C extract || exit 1
# NOTE(review): the input directory is literally named "mkdir " (with a
# trailing space) -- kept exactly as written; confirm this matches the layout
# inside the archives. The absolute paths also assume the script is started
# from /home/ubuntu, since extract/ and text/ above are created relative to
# the current directory.
python /home/ubuntu/html2text.py /home/ubuntu/extract/root/mkdir\ / /home/ubuntu/text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from readability import Document | |
import os | |
import sys | |
import io | |
for fn in os.listdir(sys.argv[1]): | |
try: | |
with io.open(os.path.join(sys.argv[1], fn), encoding='utf-8') as f: | |
text = f.read() | |
if 'batch_fails' not in fn : | |
doc = Document(text) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
rm -rf extract | |
rm -rf text | |
while IFS= read -r file; do | |
mkdir extract | |
mkdir text | |
echo processing $file | |
aws s3 cp s3://aol-on-sandbox/googlenews/links/html/$file /home/ubuntu/$file | |
tar -xzf /home/ubuntu/$file -C extract | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# For every name listed in the file given as $1: download that name's feature
# CSV from S3, append the local file test/<name> to it, and upload the combined
# CSV to the features/test/ prefix.
#
# Usage: <script> LIST_FILE   (one name per line)

# -r keeps backslashes literal; the `|| [ -n "$line" ]` clause also processes a
# final line that has no trailing newline.
while read -r line || [ -n "$line" ]
do
    # `read` with the default IFS already strips leading and trailing
    # whitespace. The previous "${line##*( )}" trim relied on extglob, which is
    # disabled in scripts, so it was not a whitespace pattern at all.
    name=$line
    [ -n "$name" ] || continue   # skip blank lines
    echo processing "$name"
    # The downloaded file, the cat inputs and the temp file are all keyed on the
    # per-line name. The original used $1 (the list file itself) for the local
    # files, which never matched what had just been downloaded.
    # NOTE(review): object names are wrapped in literal single quotes
    # ('name'.csv); kept as written -- presumably that is how the keys are
    # stored in the bucket, verify before changing.
    aws s3 cp s3://aol-on-sandbox/clickture/bing/features/"'$name'".csv . \
        && cat "'$name'".csv "test/$name" > "'$name'".csv.temp \
        && aws s3 cp "'$name'".csv.temp s3://aol-on-sandbox/clickture/bing/features/test/"'$name'".csv
done < "$1"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Download the image archive named by $1 from S3, unpack it into ./images and
# convert the GIF and BMP files under images/root/images to JPEG with
# GraphicsMagick.
#
# Usage: <script> NAME

# Append the CUDA library directory. The ${VAR:+...} form avoids a leading
# empty entry when LD_LIBRARY_PATH is unset, which the dynamic loader would
# treat as the current directory.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}/usr/local/cuda-7.5/targets/x86_64-linux/lib/"
echo processing "$1"
mkdir -p images   # -p: tolerate a directory left over from an earlier run
# NOTE(review): the archive name is wrapped in literal single quotes
# ('NAME'.tar.gz); kept as written -- presumably matches the stored keys.
# Stop when the download or extraction fails so the conversion steps do not run
# on stale or missing images.
aws s3 cp s3://aol-on-sandbox/clickture/bing/html/"'$1'".tar.gz . || exit 1
tar -xzf "'$1'".tar.gz -C images || exit 1
rm -f "'$1'".tar.gz
# Convert, then drop the originals so only the JPEG copies remain.
gm mogrify -format jpeg images/root/images/*.GIF
rm -f images/root/images/*.GIF
gm mogrify -format jpeg images/root/images/*.BMP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-7.5/targets/x86_64-linux/lib/ | |
while IFS= read -r file; do | |
echo processing $file | |
name=$(echo $file | cut -f 1 -d '.') | |
mkdir images | |
aws s3 cp s3://aol-on-sandbox/clickture/bing/html/"'$name".tar.gz . | |
tar -xzf "'$name".tar.gz -C images | |
rm -f "'$name".tar.gz | |
gm mogrify -format jpeg images/root/images/*.GIF |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'torch' | |
require 'paths' | |
--print(arg) | |
function string:split( inSplitPattern, outResults ) | |
if not outResults then | |
outResults = {} | |
end | |
local theStart = 1 |
NewerOlder