Taein KIM sappho192

## pyannote_sd_evaluate.py
# Clone https://github.com/pyannote/pyannote-audio and set ROOT_DIR to the
# path of your local checkout.
ROOT_DIR = "D:/REPO/pyannote-audio"

# Sample audio file and its reference (.rttm) file from the clone's tutorial
# assets. The earlier bare-file-name assignments were removed: they were
# overwritten unconditionally by these two lines and never read in between.
AUDIO_FILE = f"{ROOT_DIR}/tutorials/assets/sample.wav"
REFERENCE = f"{ROOT_DIR}/tutorials/assets/sample.rttm"

from huggingface_hub import HfApi
available_pipelines = [p.modelId for p in HfApi().list_models(filter="pyannote-audio-pipeline")]

## MainWindow.xaml
<Window x:Class="WpfApp1.MainWindow"
        xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
        xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
        xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
        xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
        xmlns:local="clr-namespace:WpfApp1"
        mc:Ignorable="d"
        Title="MainWindow" Height="450" Width="800">
    <Grid>
        <FlowDocumentScrollViewer Name="ChatPanel" Block.LineHeight="2">

## config.optuna.template.yaml
---
# Config for NeMo parameter optimization using Optuna and Comet ML.
optuna:
  study_name: "<STUDY_NAME>"
  db:
    type: postgres
    psql:
      host: "<PSQL_HOST>"
      port: 5432
      table: "<OPTUNA_TABLE>"

## example.py
from comet_ml import Experiment
import optuna

# Settings for this example. Values written in angle brackets are placeholders
# and must be replaced before running.
comet_api_key = '<API_KEY>'
# NOTE(review): the psql_* values are presumably the PostgreSQL credentials and
# endpoint used for Optuna storage — confirm against the code that consumes them.
psql_id = "<PSQL_ID>"
psql_pw = "<PSQL_PW>"
psql_host = "<PSQL_HOST>"
psql_port = 5432
psql_table = "optuna"
project_name = "<PROJECT_NAME>"

## infer.py
from transformers import AutoTokenizer, AutoModelForCausalLM, GPTJConfig, GPTJForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("d:/MODEL/DaramGPT")
model = GPTJForCausalLM.from_pretrained("d:/MODEL/DaramGPT")

tokens = tokenizer("서울의 날씨는 ")

print(tokenizer.decode(model.generate(**{
    "input_ids": torch.tensor([tokens["input_ids"]]),

## NOTES.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                sappho192
                / NOTES.md
            
            
              Created
              January 8, 2024 22:16
            
              
                Notes on training with Marian MT & Tatoeba dataset
              
          
    한국어 Windows OS에서 Marian 기반 일본어 → 한국어 번역 모델을 만들면서 겪은 일들을 남겨둡니다.
Tatoeba 데이터셋

내가 썼던 데이터셋에는 각종 보이지 않는 유니코드 문자들이 텍스트에 섞여 있어서 데이터 전처리와 훈련을 망치곤 했다.

그래서 대충 이런 식으로 필요 없는 공백과 문제를 일으키는 특수문자들(제로 폭 공백 U+200B, 줄 구분자 U+2028, 문단 구분자 U+2029)을 정리했다.

tp.source = line.rstrip().replace("\u200B", "").replace("\u2028","").replace("\u2029","")
특히 HelsinkiNLP/tatoeba에서 공개한 데이터셋은 윈도우에서 작업된 텍스트 파일이어서인지 줄바꿈 문자가 리눅스와 다르기 때문에(CRLF vs LF), dos2unix로 줄바꿈 문자를 한 번 정리해 줘야 한다.

그러지 않고 리눅스에서 작업한 다른 데이터셋과 병합하면 문제를 일으킬 수 있으니 주의할 것.
컴파일


## infer_onnx.py
# pip install transformers optimum onnx onnxruntime fugashi unidic-lite
from transformers import BertJapaneseTokenizer,PreTrainedTokenizerFast
from optimum.onnxruntime import ORTModelForSeq2SeqLM

# Model identifiers for the encoder (Japanese BERT) and decoder (KoGPT2) sides.
# NOTE(review): presumably used to load the matching tokenizers — the consuming
# code is outside this view, confirm there.
encoder_model_name = "cl-tohoku/bert-base-japanese-v2"
decoder_model_name = "skt/kogpt2-base-v2"

# Alternative: uncomment to use locally saved tokenizers instead of the names above.
# encoder_model_name = "./src_tokenizer"
# decoder_model_name = "./trg_tokenizer"

## Thrower.cs
// from https://forum.dotnetdev.kr/t/c-10-null-check-7/7069/2

public static class Thrower
{
	public static Exception ThrowIfFailedValidation<T>(T target, Func<T, bool> validation)
	{
		if (validation(target) is false)
		{
			throw new ValidationFailedException();
		}

## RunMission.java
/* I slightly modified the official RunMission.java example from MAVSDK-Java (v1.3.1),
*  because the original example only uploads and downloads a mission.
*  The equivalent example in the C++ SDK not only uploads the mission but also executes it, so I applied similar logic to this Java example.
*/

package io.mavsdk.example;

import io.mavsdk.System;
import io.mavsdk.mission.Mission;
import io.mavsdk.telemetry.Telemetry;

## MainWindow.xaml
<Window x:Class="WebcamCaptureApp.MainWindow"
        xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
        xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
        xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
        xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
        xmlns:local="clr-namespace:WebcamCaptureApp"
        mc:Ignorable="d"
        Title="MainWindow" Height="900" Width="1600">
    <Grid>
        <StackPanel Orientation="Vertical">
	AUDIO_FILE = "sample.wav"
	REFERENCE = "sample.rttm"

	# Clone the https://github.com/pyannote/pyannote-audio and change the path correctly
	ROOT_DIR = "D:/REPO/pyannote-audio"
	AUDIO_FILE = f"{ROOT_DIR}/tutorials/assets/sample.wav"
	REFERENCE = f"{ROOT_DIR}/tutorials/assets/sample.rttm"

	from huggingface_hub import HfApi
	available_pipelines = [p.modelId for p in HfApi().list_models(filter="pyannote-audio-pipeline")]
	<Window x:Class="WpfApp1.MainWindow"
	xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
	xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
	xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
	xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
	xmlns:local="clr-namespace:WpfApp1"
	mc:Ignorable="d"
	Title="MainWindow" Height="450" Width="800">
	<Grid>
	<FlowDocumentScrollViewer Name="ChatPanel" Block.LineHeight="2">
	---
	# Config for NeMo parameter optimization using Optuna and Comet ML.
	optuna:
	study_name: "<STUDY_NAME>"
	db:
	type: postgres
	psql:
	host: "<PSQL_HOST>"
	port: 5432
	table: "<OPTUNA_TABLE>"
	from comet_ml import Experiment
	import optuna

	comet_api_key = '<API_KEY>'
	psql_id = "<PSQL_ID>"
	psql_pw = "<PSQL_PW>"
	psql_host = "<PSQL_HOST>"
	psql_port = 5432
	psql_table = "optuna"
	project_name = "<PROJECT_NAME>"
	from transformers import AutoTokenizer, AutoModelForCausalLM, GPTJConfig, GPTJForCausalLM
	import torch

	tokenizer = AutoTokenizer.from_pretrained("d:/MODEL/DaramGPT")
	model = GPTJForCausalLM.from_pretrained("d:/MODEL/DaramGPT")

	tokens = tokenizer("서울의 날씨는 ")

	print(tokenizer.decode(model.generate(**{
	"input_ids": torch.tensor([tokens["input_ids"]]),
	# pip install transformers, optimum, onnx, onnxruntime, fugashi, unidic-lite
	from transformers import BertJapaneseTokenizer,PreTrainedTokenizerFast
	from optimum.onnxruntime import ORTModelForSeq2SeqLM

	encoder_model_name = "cl-tohoku/bert-base-japanese-v2"
	decoder_model_name = "skt/kogpt2-base-v2"

	# using local tokenizer
	# encoder_model_name = "./src_tokenizer"
	# decoder_model_name = "./trg_tokenizer"
	// from https://forum.dotnetdev.kr/t/c-10-null-check-7/7069/2

	public static class Thrower
	{
	public static Exception ThrowIfFailedValidation<T>(T target, Func<T, bool> validation)
	{
	if (validation(target) is false)
	{
	throw new ValidationFailedException();
	}
	/* I've changed official RunMission.java in MAVSDK-Java (v1.3.1) example a bit
	* because the example is just uploading & downloading mission.
	* The same example in C++ SDK demonstrates not only uploading but executing mission so I applied similar logic to this Java example code.
	*/

	package io.mavsdk.example;

	import io.mavsdk.System;
	import io.mavsdk.mission.Mission;
	import io.mavsdk.telemetry.Telemetry;
	<Window x:Class="WebcamCaptureApp.MainWindow"
	xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
	xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
	xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
	xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
	xmlns:local="clr-namespace:WebcamCaptureApp"
	mc:Ignorable="d"
	Title="MainWindow" Height="900" Width="1600">
	<Grid>
	<StackPanel Orientation="Vertical">