Skip to content

Instantly share code, notes, and snippets.

@nguyenvulebinh
nguyenvulebinh / wav2vec2_vlsp_test.ipynb
Created May 27, 2023 04:49
wav2vec2_vlsp_test.ipynb
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@nguyenvulebinh
nguyenvulebinh / signal_processing.py
Last active May 17, 2023 08:21
signal processing utils
import torch
import math
from packaging import version
import random
def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"):
"""Compute amplitude of a batch of waveforms.
Arguments
@nguyenvulebinh
nguyenvulebinh / clean_envi_doc.py
Last active April 16, 2022 07:12
Remove unknown characters from English and Vietnamese documents
import re
# Digits plus every upper/lower-case letter (including all toned vowels)
# that is accepted in English and Vietnamese text.
CHARACTERS = (
    "0123456789aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍị"
    "ỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ"
)

# Accepted punctuation. The backslash is written as "\\" because "\|" is an
# invalid escape sequence (SyntaxWarning on Python 3.12+); the resulting
# string value is unchanged: a literal backslash followed by a pipe.
PUNCTUATION = ".,?!@%~`#$^&*()-_+=[]{}\\|:;\"'<>/"

ALL_CHARS = CHARACTERS + PUNCTUATION

# Matches any single character that is neither a plain space nor a member of
# ALL_CHARS. re.escape keeps characters such as "]", "^", "-" and "\" literal
# inside the character class.
WORD_NORMALIZER = re.compile(r"[^ {}]".format(re.escape(ALL_CHARS)))


def remove_unk_char(text: str) -> str:
    """Replace every character outside the accepted set with a space.

    The accepted set is the plain space plus ``ALL_CHARS``. Each rejected
    character is replaced one-for-one, so the length of the result equals
    the length of ``text``. Whitespace other than the plain space (tabs,
    newlines, ...) is not in the accepted set and is therefore replaced too.

    Args:
        text: The input string to clean.

    Returns:
        A copy of ``text`` with each unaccepted character turned into ``' '``.
    """
    return WORD_NORMALIZER.sub(' ', text)
@nguyenvulebinh
nguyenvulebinh / parser_wiki_dump.py
Created January 3, 2020 02:29
Parse text from a Wikipedia dump file
import xml.sax
import os
import re
import subprocess
class WikiXmlHandler(xml.sax.handler.ContentHandler):
"""Content handler for Wiki XML data using SAX"""
@staticmethod
@nguyenvulebinh
nguyenvulebinh / format_tone.py
Last active July 31, 2019 09:37
Format Tone for Vietnamese Sentences
import ftfy
import bogo
import re
map_char = {
"à": ["a", "f"], "á": ["a", "s"], "â": ["aa", ""], "ã": ["a", "x"], "è": ["e", "f"], "é": ["e", "s"],
"ê": ["ee", ""], "ì": ["i", "f"], "í": ["i", "s"], "ò": ["o", "f"], "ó": ["o", "s"], "ô": ["oo", ""],
"õ": ["o", "x"], "ù": ["u", "f"], "ú": ["u", "s"], "ý": ["y", "s"], "ă": ["aw", ""], "ĩ": ["i", "x"],
"ũ": ["u", "x"], "ơ": ["ow", ""], "ư": ["uw", ""], "ạ": ["a", "j"], "ả": ["a", "r"], "ấ": ["aa", "s"],
"ầ": ["aa", "f"], "ẩ": ["aa", "r"], "ẫ": ["aa", "x"], "ậ": ["aa", "j"], "ắ": ["aw", "s"], "ằ": ["aw", "f"],
@nguyenvulebinh
nguyenvulebinh / analogies.py
Created May 18, 2019 10:16
Analogies task using word embeddings and annoy library
import numpy as np
from annoy import AnnoyIndex
class PretrainedEmbeddings(object):
def __init__(self, word_to_index, word_vectors):
"""
Args
:param word_to_index: dict mapping from word to integers
:param word_vectors: list of numpy arrays
@nguyenvulebinh
nguyenvulebinh / flatten_all_spark_schema.py
Last active August 8, 2023 15:08
Flatten a Spark DataFrame schema (including struct and array types)
import typing as T
import cytoolz.curried as tz
import pyspark
from pyspark.sql.functions import explode
def schema_to_columns(schema: pyspark.sql.types.StructType) -> T.List[T.List[str]]:
columns = list()