Skip to content

Instantly share code, notes, and snippets.

import os
import pysrt
# NOTE(review): snippet is truncated by the page scrape — the subprocess.call()
# argument list is never closed; the remainder of the youtube-dl flags
# (and presumably the video URL) are missing from this view.
def download_youtube_srt(link):
# Assumes the link contains '?v=<id>'; everything after '?v=' is taken as
# the video id. TODO confirm: breaks for URLs with extra query parameters.
video_id = link[link.find('?v=') + 3:]
import subprocess
# Shells out to youtube-dl (list form, shell=False) to fetch subtitles.
subprocess.call(['youtube-dl',
# for sub
# Request English subtitles.
'--sub-lang', 'en',
@bowbowbow
bowbowbow / extract_morpheme_from_sejong_corpus.py
Last active December 16, 2019 19:51
세종 코퍼스에서 형태소 분석 데이터 추출
import glob
from bs4 import BeautifulSoup
from tqdm import tqdm
import json
# NOTE(review): truncated by the page scrape — the processing loop that
# presumably consumes `paths` and increments `pair_count` is missing.
if __name__ == '__main__':
# Collect all UTF-8 converted Sejong corpus files.
paths = glob.glob('./corpus-utf8/*.txt')
print('len(paths):', len(paths))
# Running count of extracted (sentence, morpheme) pairs — TODO confirm.
pair_count = 0
def load_glove(glove_path):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-empty line is expected to be: ``word c1 c2 ... cd`` where the
    components are floats.

    Args:
        glove_path: Path to the GloVe ``.txt`` file.

    Returns:
        dict mapping word (str) -> np.ndarray of float components.
    """
    word2vec = {}
    # GloVe distributions are UTF-8; be explicit so the platform default
    # encoding (e.g. cp1252 on Windows) cannot corrupt non-ASCII tokens.
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            # Tolerate blank/whitespace-only lines instead of raising IndexError.
            if not split_line:
                continue
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]])
            word2vec[word] = embedding
    print("Done.", len(word2vec), " words loaded!")
    return word2vec
import os
import time
import sys
import datetime
import random
import json
import pickle
import numpy as np
from flask import Flask, session, g, request, render_template, redirect
from flask_mongoengine import MongoEngine
"settings": {
"index": {
"analysis": {
"analyzer": {
"korean": {
"type": "custom",
"tokenizer": "seunjeon_default_tokenizer",
"min_token_length": 2,
"decompound_mode": "mixed",
"filter": ['cobak_synonym_filter', 'token_length_filter'],
# Convolutional neural network (two convolutional layers)
# NOTE(review): truncated by the page scrape — the nn.Sequential(...) call is
# never closed and layer2 / forward() are missing from this view.
class ConvNet(nn.Module):
def __init__(self, hidden_size=200):
super(ConvNet, self).__init__()
self.hidden_size = hidden_size
# First conv block: 34 input channels -> 68 output channels; kernel 5,
# stride 1, padding 2 preserves the sequence length.
self.layer1 = nn.Sequential(
nn.Conv1d(34, 68, kernel_size=5, stride=1, padding=2),
nn.BatchNorm1d(68),
nn.CELU(),
# NOTE(review): class is truncated by the page scrape — only __init__ is
# visible; the forward() pass is missing from this view.
class BertNet(nn.Module):
def __init__(self, finetuning=False, num_classes=3, hidden_size=50):
super().__init__()
# Pretrained BERT encoder (huggingface transformers).
self.bert = BertModel.from_pretrained('bert-base-uncased')
# 768 is the hidden size of bert-base; kept as an explicit attribute so
# the LSTM input size below stays in sync.
self.bert_output_size = 768
self.hidden_size = hidden_size
# BiLSTM over the BERT token embeddings; batch_first expects
# (batch, seq, features) input.
self.rnn = nn.LSTM(input_size=self.bert_output_size, hidden_size=self.hidden_size, batch_first=True, bidirectional=True)
# * 2 because the LSTM is bidirectional (forward + backward states).
self.fc = nn.Linear(self.hidden_size * 2, num_classes)
# Presumably gates whether BERT weights get gradient updates in
# forward() — TODO confirm against the missing forward implementation.
self.finetuning = finetuning
# NOTE(review): truncated by the page scrape — the loop body that renders each
# misclassified example into the HTML file is missing from this view.
def draw(epoch, input_x, input_y, predicts, input_c_pos, id2label, id2word):
# One visualization HTML file per epoch.
sents_visual_file = './visualization/{}.html'.format(epoch)
batch_size = len(input_y)
with open(sents_visual_file, "w") as html_file:
html_file.write('<!DOCTYPE html><html lang="ko"><head><meta charset="UTF-8"/></head>')
for i in range(batch_size):
# Only misclassified examples are visualized; correct ones are skipped.
if input_y[i] == predicts[i]: continue
import pymongo
_connection = None
def get_db():
    """Return a handle to the 'dbname' database, creating the client lazily.

    A single module-level MongoClient (``_connection``) is reused across
    calls so each call does not open a new connection pool.

    Returns:
        pymongo.database.Database: the 'dbname' database handle.
    """
    global _connection
    # Must compare with `is None`: pymongo client/database objects
    # deliberately do NOT implement truth-value testing and raise
    # NotImplementedError under bool(), so `if not _connection` crashes
    # once a client has been created.
    if _connection is None:
        _connection = pymongo.MongoClient('mongodb://127.0.0.1')
    return _connection['dbname']
import utils
import nltk, datetime
from pprint import pprint
import spacy
nlp = spacy.load('en_core_web_lg')