Habeeb Shopeju (HAKSOAT)
🏗️ Building information retrieval systems...
@HAKSOAT
HAKSOAT / text_preprocessing.py
Created April 15, 2023 10:47 — forked from jiahao87/text_preprocessing.py
Full code for preprocessing text
from bs4 import BeautifulSoup  # strip HTML markup
import spacy                   # tokenization, lemmatization, stop words
import unidecode               # accented characters -> ASCII
from word2number import w2n    # number words -> digits
import contractions            # expand contractions, e.g. "don't" -> "do not"

nlp = spacy.load('en_core_web_md')

# exclude words from spacy stopwords list, so negations survive preprocessing
deselect_stop_words = ['no', 'not']
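The preview cuts off here. As a hedged sketch (not from the original gist), deselecting stop words in spaCy is typically done by flipping the is_stop flag on the vocabulary, assuming the nlp pipeline loaded above:

# Minimal sketch: mark 'no' and 'not' as non-stopwords in the loaded
# pipeline so they are kept during stop-word filtering.
for word in deselect_stop_words:
    nlp.vocab[word].is_stop = False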
import os
from asyncio import create_subprocess_shell, subprocess as aio_subprocess
from datetime import timedelta

SAVE_FORMAT = "{title}-{start}-{duration}.{extension}"

async def aio_exec(command):
    # Body reconstructed (an assumption); the gist preview truncates here.
    process = await create_subprocess_shell(
        command, stdout=aio_subprocess.PIPE, stderr=aio_subprocess.PIPE)
    return await process.communicate()
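A hypothetical usage, assuming the reconstructed body above:

import asyncio

stdout, stderr = asyncio.run(aio_exec("echo hello"))
print(stdout.decode().strip())  # hello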
@HAKSOAT
HAKSOAT / SearchEngineer.md
Created July 17, 2021 22:11 — forked from morria/SearchEngineer.md
Search Engineer

Search Relevance Engineer

Working with the Search team, you'll be applying your background in Information Retrieval, Machine Learning or Data Mining to run experiments and develop products that have a provable impact on the Etsy marketplace. You'll be analyzing data, understanding language, developing new algorithms and building large-scale distributed systems.

Our team is responsible for creating and optimizing the best experiences for buyers and getting the best performance for sellers. Our work focuses on improvements to search ranking, query understanding, spelling correction, autocompletion, and query intent recognition.

Requirements

  • Strong background in Machine Learning, Statistics, Information Retrieval
On 8 June 1954, Turing's housekeeper found him dead at the age of 41; he had died the previous day.
HAKS
HEKS
HIKS
HOKS
HUKS
HJKS
H1KS
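These read as single-character variants of "HAKS" at the second position. A hedged sketch (entirely inferred, not from the original gist) that would generate such a list:

# Inferred illustration: substitute the second character of "HAKS"; the
# candidate set below is simply read off the list above.
base = "HAKS"
candidates = "EIOUJ1"
variants = [base] + [base[0] + c + base[2:] for c in candidates]
print(variants)  # ['HAKS', 'HEKS', 'HIKS', 'HOKS', 'HUKS', 'HJKS', 'H1KS']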
{'article_pid': 18951386, 'wp_templates': ['WikiProject Objectivism', 'WikiProject Novels',
'WikiProject Philosophy', 'WikiProject Libertarianism', 'WikiProject Politics',
'WikiProject Trains'], 'title': 'Atlas Shrugged', 'talk_revid': 911346471,
'taxo_labels': ['Culture.Philosophy and religion', 'Culture.Media.Media*', 'Culture.Media.Books',
'Culture.Literature', 'Culture.Philosophy and religion', 'History and Society.Politics and government',
'History and Society.Politics and government', 'History and Society.Transportation'],
'article_revid': 926765055,
'sitelinks': {'ru': 'Атлант расправил плечи', 'cs': 'Atlasova vzpoura',
'hy': 'Ատլանտը պարզեց թևերը', 'da': 'Og verden skælvede', 'ky': 'Атлант ийиндерин куушурду',
'de': 'Atlas wirft die Welt ab', 'simple': 'Atlas Shrugged', 'sv': 'Och världen skälvde',
# Extracts the text used for the performance test
import time
import mwapi

session = mwapi.Session("https://en.wikipedia.org")
# Fetch the wikitext of the latest revision of the "Alan Turing" article.
doc = session.get(action='query', prop='revisions', rvprop='content', titles='Alan Turing', formatversion=2)
text = doc['query']['pages'][0]['revisions'][0]['content']
# Functions for tokenization
import json

# Creates an index for the tokenizer
import requests

param = (('v', ''),)
data = r"""{
  "settings": {
    "index.analyze.max_token_count": 1000000,
    "analysis": {
      "analyzer": {
@HAKSOAT
HAKSOAT / Regexes
Created April 24, 2020 14:35
Regexes for the Tokenizer
Python's regex
(?P<comment_start><!--)|(?P<comment_end>-->)|(?P<url>((bitcoin|geo|magnet|mailto|news|sips?|tel|urn)\:|((|ftp|ftps|git|gopher|https?|ircs?|mms|nntp|redis|sftp|ssh|svn|telnet|worldwind|xmpp)\:)?\/\/)[^\s/$.?#].[^\s]*)|(?P<entity>&[a-z][a-z0-9]*;)|(?P<cjk>[\u4E00-\u62FF\u6300-\u77FF\u7800-\u8CFF\u8D00-\u9FCC\u3400-\u4DFF\U00020000-\U000215FF\U00021600-\U000230FF\U00023100-\U000245FF\U00024600-\U000260FF\U00026100-\U000275FF\U00027600-\U000290FF\U00029100-\U0002A6DF\uF900-\uFAFF\U0002F800-\U0002FA1F\u3041-\u3096\u30A0-\u30FF\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A\u2E80-\u2FD5\uFF5F-\uFF9F\u31F0-\u31FF\u3220-\u3243\u3280-\u337F])|(?P<ref_open><ref\b[^>/]*>)|(?P<ref_close></ref\b[^>]*>)|(?P<ref_singleton><ref\b[^>/]*/>)|(?P<tag></?([a-z][a-z0-9]*)\b[^>]*>)|(?P<number>[\d]+)|(?P<japan_punct>[\u3000-\u303F])|(?P<danda>।|॥)|(?P<bold>''')|(?P<italic>'')|(?P<word>([^\W\d]|[\u0901-\u0963\u0601-\u061A\u061C-\u0669\u06D5-\u06EF\u0980-\u09FF])[\w\u0901-\u0963\u0601-\u061A\u061C-\u0669\u06D5-\u06EF\u0980-\u0
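The pattern is truncated above. As a hedged sketch of how such a named-group tokenizer regex is usually driven (the simplified stand-in pattern below is not the gist's pattern):

import re

# Simplified stand-in for the full (truncated) pattern above.
TOKEN_PATTERN = re.compile(r"(?P<number>\d+)|(?P<word>[^\W\d]+)")

def tokenize(text):
    # match.lastgroup names the alternative that matched, e.g. 'word' or 'number'.
    for match in TOKEN_PATTERN.finditer(text):
        yield match.lastgroup, match.group()

print(list(tokenize("Turing 1954")))  # [('word', 'Turing'), ('number', '1954')]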
{
  "tokens": [
    {
      "token": "As",
      "start_offset": 0,
      "end_offset": 2,
      "type": "word",
      "position": 0
    },
    {
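This token list has the shape of an Elasticsearch _analyze response. A hedged sketch of a call that returns this shape, assuming a local cluster and the index created above (both assumptions):

import json
import requests

# Hypothetical: host and index name are assumptions; the response shape
# matches the token list above.
resp = requests.post(
    "http://localhost:9200/tokenizer/_analyze",
    json={"text": "As Turing said"},
)
print(json.dumps(resp.json(), indent=2))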
def decorate_text(text):
    decoration = "\n\n**********{}**********\n\n"
    decorated_text = decoration.format(text)
    return decorated_text
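For example:

print(decorate_text("Results"))
# prints a blank line, **********Results**********, and another blank line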
# Unpacking of values
def generate_multiplications_1(multiplicand, start, stop):
    multiplications = []
    for multiplier in range(start, stop + 1):
        # Loop body reconstructed (an assumption); the preview truncates here.
        multiplications.append(multiplicand * multiplier)
    return multiplications
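Assuming the reconstructed body above:

print(generate_multiplications_1(3, 1, 5))  # [3, 6, 9, 12, 15]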