Skip to content

Instantly share code, notes, and snippets.

View snakers4's full-sized avatar
🚀
It is by will alone I set my mind in motion.

Alexander Veysov snakers4

🚀
It is by will alone I set my mind in motion.
View GitHub Profile
@snakers4
snakers4 / parse_cc_index.py
Last active September 14, 2023 20:00
Plain common crawl pre-processing
import gc
import gzip
import time
import json
import shutil
import os,sys
import tldextract
import collections
import pandas as pd
from tqdm import tqdm
{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"id":"vA0-1iElD-wr","executionInfo":{"status":"ok","timestamp":1681303032533,"user_tz":-180,"elapsed":10963,"user":{"displayName":"Senhor Maestro","userId":"08176940519269874318"}}},"outputs":[],"source":["import numpy as np\n","import pandas as pd\n","import torch\n","\n","import seaborn as sns\n","import matplotlib.pyplot as plt\n","\n","from sklearn.model_selection import train_test_split\n","from sklearn.feature_selection import mutual_info_regression\n","from sklearn.metrics import accuracy_score\n","#from catboost import CatBoostClassifier"]},{"cell_type":"code","source":["from google.colab import drive\n","drive.mount('/content/drive')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"2rjmy9OpEYEr","executionInfo":{"status":"ok","timestamp":1681303100348,"user_tz":-180,"elapsed":19790,"user":{"displayName":"Senhor Maestro","userId":"08176940519269874318"}},"outputId":"cac0e451-2165-406b-cba1-f8c3bf6d480f"},"execution_count":
{
"cells": [
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"id": "1XEMm5oo36Sm"
},
"outputs": [],
"source": [
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@snakers4
snakers4 / Loss.py
Created July 21, 2018 11:02
Multi class classification focal loss
import torch
import torch.nn as nn
import torch.nn.functional as F
# Focal loss implementation inspired by
# https://github.com/c0nn3r/RetinaNet/blob/master/focal_loss.py
# https://github.com/doiken23/pytorch_toolbox/blob/master/focalloss2d.py
class MultiClassBCELoss(nn.Module):
def __init__(self,
use_weight_mask=False,
@snakers4
snakers4 / process_wikipedia.py
Last active January 4, 2023 22:19
Post process wikipedia files produced by wikiextractor
import os
import re
import sys
import glob
import nltk
import gensim
import numpy as np
import pandas as pd
from tqdm import tqdm
from uuid import uuid4
@snakers4
snakers4 / parse_cc_index.py
Last active October 12, 2022 10:43
Plain scripts to parse Common Crawl
import gc
import gzip
import time
import json
import shutil
import os,sys
import tldextract
import collections
import pandas as pd
from tqdm import tqdm
# Simple correlation analysis to deal with variable bias
data_corr = sDf.corr()
size = data_corr.shape[0] - 1
# Set the threshold to select only highly correlated attributes
threshold = 0.5
# List of pairs along with correlation above threshold
corr_list = []
@snakers4
snakers4 / abstractions.py
Last active May 27, 2022 07:12
My XGB boilerplate
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
#Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics #Additional sklearn functions
from sklearn.grid_search import GridSearchCV #Performing grid search
import matplotlib.pylab as plt
@snakers4
snakers4 / examples.md
Created October 6, 2021 15:58
Text Enhancement Examples
Original Model
She heard Missis Gibson talking on in a sweet monotone, and wished to attend to what she was saying, but the Squires visible annoyance struck sharper on her mind. She heard Missis Gibson talking on in a sweet monotone and wished to attend to what she was saying, but the squires visible ann