Skip to content

Instantly share code, notes, and snippets.

@qkdxorjs1002
Last active January 24, 2024 01:35
Show Gist options
  • Save qkdxorjs1002/76225120255c6f79857e2b012045c5bb to your computer and use it in GitHub Desktop.
Save qkdxorjs1002/76225120255c6f79857e2b012045c5bb to your computer and use it in GitHub Desktop.
NIA 음성수집 가공 데이터 다양성 통계 스크립트
import json
import os
from decimal import Decimal
import time
from datetime import datetime
import sys
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))
from utils.slackpushbot import SlackPushBot
### In path arguments, write every backslash ('\') twice.
### Example) = "C:\\NIA_COMPLETE_labeling\\call"
## Speaker diversity statistics
# python -c "import statistic; statistic.calculate('C:\\NIA_COMPLETE_labeling\\call')"
## Child diversity statistics
# python -c "import statistic; statistic.calculateChild('C:\\NIACHILD_COMPLETE_labeling\\eng_formatted')"
### Debugging switch: when True, the device alias below is not read and the
### Slack notifications in calculate()/calculateChild() are skipped.
IS_DEBUGGING = False
### Device alias (environment variable), used in the Slack message texts.
### A missing NIA_DEVICE_ALIAS raises KeyError when this module is imported.
if not IS_DEBUGGING:
    deviceAlias = os.environ["NIA_DEVICE_ALIAS"]
class DataModel:
    """Per-category accumulators for the speaker data set.

    Every attribute is a dict that maps a category key to a running
    ``Decimal`` total, starting at zero.  ``calculate()`` adds each file's
    ``FileLength`` to one key of every dict.
    """

    def __init__(self) -> None:
        ### Gender statistics
        genderKeys = (
            "Male",    ## male
            "Female",  ## female
        )
        self.statsOfGender = {key: Decimal(0.0) for key in genderKeys}

        ### Age statistics
        ageKeys = ("~19", "20~59", "60~", "etc.")
        self.statsOfAge = {key: Decimal(0.0) for key in ageKeys}

        ### Region statistics
        regionKeys = (
            "00",  ## Seoul
            "01",  ## Incheon / Gyeonggi
            "02",  ## Busan / Daegu / Ulsan / Gyeongsang
            "03",  ## Gwangju / Jeolla / Jeju
            "04",  ## Daejeon / Sejong / Chungcheong / Gangwon
            "05",  ## other
        )
        self.statsOfRegion = {key: Decimal(0.0) for key in regionKeys}

        ### Culture statistics
        cultureKeys = (
            "다문화",  ## multicultural
            "한문화",  ## Korean culture
        )
        self.statsOfCulture = {key: Decimal(0.0) for key in cultureKeys}

        ### Utterance-type statistics ("00".."04" are the wakeUpWord styles)
        typeKeys = (
            "00",              # wake-up word - plain
            "01",              # wake-up word - fast
            "02",              # wake-up word - slow
            "03",              # wake-up word - slightly loud
            "04",              # wake-up word - quiet
            "commonSentence",  # common sentence
            "randomText",      # random sentence
            "continuous",      # continuous speech
        )
        self.statsOfType = {key: Decimal(0.0) for key in typeKeys}

        ### Recording-environment statistics
        envKeys = ("Studio", "Noise", "Clean")
        self.statsOfEnv = {key: Decimal(0.0) for key in envKeys}
class ChildDataModel:
    """Per-category accumulators for the child data set.

    Every attribute is a dict that maps a category key to a running
    ``Decimal`` total, starting at zero.  ``calculateChild()`` keeps one
    instance per language and adds each file's ``FileLength`` to one key of
    every dict.
    """

    def __init__(self) -> None:
        ### Gender statistics
        genderKeys = (
            "M",  # male
            "F",  # female
        )
        self.statsOfGender = {key: Decimal(0.0) for key in genderKeys}

        ### Age-group statistics
        ageGroupKeys = (
            "G01",  # preschool
            "G02",  # lower grades
            "G03",  # upper grades
        )
        self.statsOfAge = {key: Decimal(0.0) for key in ageGroupKeys}

        ### Noise-environment statistics
        envKeys = (
            "Silence",  # low noise
            "Noise",    # noise
            "N/A",
        )
        self.statsOfEnv = {key: Decimal(0.0) for key in envKeys}

        ### Script-type statistics
        typeKeys = (
            "Formatted",  # scripted
            "Free",       # unscripted
        )
        self.statsOfType = {key: Decimal(0.0) for key in typeKeys}
def convertSecondsToHours(seconds):
    """Return *seconds* expressed in hours, rounded to three decimal places.

    The result keeps the numeric type produced by dividing *seconds* by an
    int (for example a ``Decimal`` in gives a ``Decimal`` out).
    """
    secondsPerHour = 3600
    hours = seconds / secondsPerHour
    return round(hours, 3)
def printWithWriteLog(text, fileHandler):
    """Echo *text* to standard output and append it, as one line, to *fileHandler*.

    Args:
        text: Any object; its ``str()`` form is what gets printed and written.
        fileHandler: Object with a ``write(str)`` method, e.g. an open text file.
    """
    print(text)
    logLine = "%s\n" % (text,)
    fileHandler.write(logLine)
### Aggregation (speaker data)
def _tallySpeakerJSON(jsonPath, stats):
    """Add one speaker label file's ``FileLength`` to every bucket of *stats*.

    All bucket keys are looked up and validated before anything is mutated,
    so a record that turns out to be invalid never leaves a partial
    contribution behind.

    Args:
        jsonPath: Path of the ``.json`` label file to read.
        stats: ``DataModel`` instance whose dicts are updated in place.

    Raises:
        Exception: Any read, parse, lookup or conversion error, or the
            explicit "JSON File has 'null'" rejection.  The caller records
            the file as invalid.
    """
    with open(jsonPath, 'r', encoding='UTF8') as jsonFile:
        jsonData = json.load(jsonFile)

    # Reject files containing a JSON null.  This is a substring test on the
    # re-serialised document, so the text "null" inside a string value is
    # rejected as well -- presumably intended; confirm with the data owners.
    if "null" in json.dumps(jsonData):
        raise Exception("JSON File has 'null'")

    fileLength = Decimal(jsonData['File']['FileLength'])
    dataCategory = jsonData["Basic"]["DataCategory"]

    # Speaker numbers of 9000 and above are tallied as multicultural.
    if int(jsonData["Basic"]["NumberOfSpeaker"]) >= 9000:
        cultureKey = "다문화"
    else:
        cultureKey = "한문화"

    # Wake-up-word recordings are split by speaking style; every other
    # category is tallied under its own name.
    if dataCategory == "wakeUpWord":
        typeKey = jsonData["Environment"]["Style"]
    else:
        typeKey = dataCategory

    pending = [
        (stats.statsOfGender, jsonData["Speaker"]["Gender"]),
        (stats.statsOfAge, jsonData["Speaker"]["Age"]),
        (stats.statsOfRegion, jsonData["Speaker"]["Region"]),
        (stats.statsOfCulture, cultureKey),
        (stats.statsOfType, typeKey),
        (stats.statsOfEnv, jsonData["Environment"]["RecordingEnviron"]),
    ]

    # Validate first, commit second: an unknown key must not leave the
    # earlier buckets already incremented.
    for bucket, key in pending:
        if key not in bucket:
            raise KeyError(key)
    for bucket, key in pending:
        bucket[key] += fileLength


def calculate(targetPath):
    """Aggregate speaker-diversity statistics for the label files under *targetPath*.

    Walks *targetPath* recursively and, for every ``.json`` file, adds its
    ``File.FileLength`` to the gender, age, region, culture, utterance-type
    and recording-environment buckets of a ``DataModel``.  The totals are
    converted with ``convertSecondsToHours`` and printed and written, one
    value per line, to ``result_<timestamp>.log`` in the current directory.
    Files that cannot be processed are counted as invalid and listed in
    ``<timestamp>.log``.  Unless ``IS_DEBUGGING`` is true, start, exception
    and finish messages are posted through ``SlackPushBot``.

    Args:
        targetPath: Root directory to scan.
    """
    numberOfJSON = 0
    numberOfInvalidJSON = 0
    stats = DataModel()
    # Record the start time.
    startTime = time.time()

    if not IS_DEBUGGING:
        # SECURITY NOTE(review): this looks like a hard-coded Slack credential.
        # It is kept unchanged to preserve behaviour, but it should be rotated
        # and loaded from the environment or a secret store instead.
        slackPushBot = SlackPushBot("xoxb-2118766093648-2927439136034-fRQ2wim8rdP5MCxqA4cBTD1F", "C02TBMZ2R4L")
        slackPushBot.postMessage(":round_pushpin: [" + deviceAlias + "] -- 다양성 통계(화자) 집계 시작\n" + targetPath)

    exceptionLogPath = datetime.now().strftime("%Y-%m-%d_%H.%M.%S") + ".log"
    beforePath = ""
    with open(exceptionLogPath, 'a', encoding="utf-8") as exceptionLogHandler:
        for path, _dirs, files in os.walk(targetPath):
            for file in files:
                if not file.endswith(".json"):
                    continue
                jsonPath = path + "/" + file
                # Print each directory once, when its first JSON file is reached.
                if beforePath != path:
                    print(path)
                    beforePath = path
                try:
                    _tallySpeakerJSON(jsonPath, stats)
                except Exception as e:
                    # Unreadable, unparsable or unexpectedly shaped file:
                    # count it as invalid and record why.
                    numberOfInvalidJSON += 1
                    exceptionLogHandler.write(jsonPath + "\n err: " + str(e) + "\n")
                else:
                    # Counted only once the record was tallied completely, so
                    # a file is never both valid and invalid.
                    numberOfJSON += 1

    resultLogPath = "result_" + datetime.now().strftime("%Y-%m-%d_%H.%M.%S") + ".log"
    with open(resultLogPath, "a", encoding="utf-8") as resultLogHandler:
        printWithWriteLog("✅ Job is done.\n", resultLogHandler)
        printWithWriteLog("🔴 Number of JSON: " + str(numberOfJSON), resultLogHandler)
        printWithWriteLog("🔴 Number of Invalid JSON: " + str(numberOfInvalidJSON), resultLogHandler)
        if numberOfInvalidJSON > 0:
            printWithWriteLog("🔴 Execption Log: " + exceptionLogPath, resultLogHandler)
            if not IS_DEBUGGING:
                resp = slackPushBot.postMessage(":red_circle: [" + deviceAlias + "] -- 다양성 통계(화자) 집계 중 예외 발생\nJSON "+ str(numberOfInvalidJSON) + "건 발생")
                # The exception log was closed above, so its content is complete.
                # resp["ts"] is passed as the second argument -- presumably to
                # post the log as a reply to that message; confirm in SlackPushBot.
                with open(exceptionLogPath, 'r', encoding="utf-8") as exceptionLogReader:
                    slackPushBot.postMessage("❗ 예외\n\n```" + exceptionLogReader.read() + "```", resp["ts"])
        # Plain "s" suffix: the previous "\bs" emitted a backspace control
        # character that erased the last digit on terminals.
        printWithWriteLog("🔴 Running Time:" + str(round(time.time() - startTime, 2)) + "s", resultLogHandler)

        ## Time statistics, one value per line in this fixed order.
        ## The "etc." age bucket and region "05" are accumulated but not reported.
        printWithWriteLog("\n✅ Statstics", resultLogHandler)
        reportRows = (
            stats.statsOfGender["Male"],
            stats.statsOfGender["Female"],
            stats.statsOfAge["~19"],
            stats.statsOfAge["20~59"],
            stats.statsOfAge["60~"],
            stats.statsOfRegion["00"] + stats.statsOfRegion["01"],  # regions "00" and "01" combined
            stats.statsOfRegion["04"],
            stats.statsOfRegion["03"],
            stats.statsOfRegion["02"],
            stats.statsOfCulture["한문화"],
            stats.statsOfCulture["다문화"],
            stats.statsOfType["00"],
            stats.statsOfType["02"],
            stats.statsOfType["01"],
            stats.statsOfType["03"],
            stats.statsOfType["04"],
            stats.statsOfType["commonSentence"],
            stats.statsOfType["randomText"],
            stats.statsOfType["continuous"],
            stats.statsOfEnv["Studio"],
            stats.statsOfEnv["Clean"],
            stats.statsOfEnv["Noise"],
        )
        for seconds in reportRows:
            printWithWriteLog(convertSecondsToHours(seconds), resultLogHandler)

    if not IS_DEBUGGING:
        resp = slackPushBot.postMessage(":black_square_for_stop: [" + deviceAlias + "] -- 다양성 통계(화자) 집계 종료\n" + targetPath)
        with open(resultLogPath, "r", encoding="utf-8") as resultLogReader:
            slackPushBot.postMessage("🆗 결과\n\n```" + resultLogReader.read() + "```", resp["ts"])
### Aggregation (child data)
def _tallyChildJSON(jsonPath, stats):
    """Add one child label file's ``FileLength`` to the buckets of its language.

    All bucket keys are looked up and validated before anything is mutated,
    so a record that turns out to be invalid never leaves a partial
    contribution behind.

    Args:
        jsonPath: Path of the ``.json`` label file to read.
        stats: Dict mapping a language code to a ``ChildDataModel``; the
            model selected by the file's ``Basic.Language`` is updated in place.

    Raises:
        Exception: Any read, parse, lookup or conversion error, or the
            explicit "JSON File has 'null'" rejection.  The caller records
            the file as invalid.
    """
    with open(jsonPath, 'r', encoding='UTF8') as jsonFile:
        jsonData = json.load(jsonFile)

    # Reject files containing a JSON null.  This is a substring test on the
    # re-serialised document, so the text "null" inside a string value is
    # rejected as well -- presumably intended; confirm with the data owners.
    if "null" in json.dumps(jsonData):
        raise Exception("JSON File has 'null'")

    fileLength = Decimal(jsonData['File']['FileLength'])
    language = jsonData["Basic"]["Language"]
    languageStats = stats[language]  # KeyError for a language that is not tracked

    pending = [
        (languageStats.statsOfGender, jsonData["Speaker"]["Gender"]),
        (languageStats.statsOfAge, jsonData["Speaker"]["AgeGroup"]),
        (languageStats.statsOfEnv, jsonData["Environment"]["NoiseEnviron"]),
        (languageStats.statsOfType, jsonData["Basic"]["DataCategory"]),
    ]

    # Validate first, commit second: an unknown key must not leave the
    # earlier buckets already incremented.
    for bucket, key in pending:
        if key not in bucket:
            raise KeyError(key)
    for bucket, key in pending:
        bucket[key] += fileLength


def calculateChild(targetPath):
    """Aggregate child-diversity statistics for the label files under *targetPath*.

    Walks *targetPath* recursively and, for every ``.json`` file, adds its
    ``File.FileLength`` to the gender, age-group, noise-environment and
    script-type buckets of the ``ChildDataModel`` for the file's language
    ("KOR" or "ENG").  The totals are converted with
    ``convertSecondsToHours`` and printed and written, one value per line,
    to ``result_<timestamp>.log`` in the current directory.  Files that
    cannot be processed are counted as invalid and listed in
    ``exception_<timestamp>.log``.  Unless ``IS_DEBUGGING`` is true, start,
    exception and finish messages are posted through ``SlackPushBot``.

    Args:
        targetPath: Root directory to scan.
    """
    numberOfJSON = 0
    numberOfInvalidJSON = 0
    stats = {
        ### Korean
        "KOR": ChildDataModel(),
        ### English
        "ENG": ChildDataModel()
    }
    # Record the start time.
    startTime = time.time()

    if not IS_DEBUGGING:
        # SECURITY NOTE(review): this looks like a hard-coded Slack credential.
        # It is kept unchanged to preserve behaviour, but it should be rotated
        # and loaded from the environment or a secret store instead.
        slackPushBot = SlackPushBot("xoxb-2118766093648-2927439136034-fRQ2wim8rdP5MCxqA4cBTD1F", "C02TBMZ2R4L")
        slackPushBot.postMessage(":round_pushpin: [" + deviceAlias + "] -- 다양성 통계(아동) 집계 시작\n" + targetPath)

    exceptionLogPath = "exception_" + datetime.now().strftime("%Y-%m-%d_%H.%M.%S") + ".log"
    beforePath = ""
    with open(exceptionLogPath, 'a', encoding="utf-8") as exceptionLogHandler:
        for path, _dirs, files in os.walk(targetPath):
            for file in files:
                if not file.endswith(".json"):
                    continue
                jsonPath = path + "/" + file
                # Print each directory once, when its first JSON file is reached.
                if beforePath != path:
                    print(path)
                    beforePath = path
                try:
                    _tallyChildJSON(jsonPath, stats)
                except Exception as e:
                    # Unreadable, unparsable or unexpectedly shaped file:
                    # count it as invalid and record why.
                    numberOfInvalidJSON += 1
                    exceptionLogHandler.write(jsonPath + "\n err: " + str(e) + "\n")
                else:
                    # Counted only once the record was tallied completely, so
                    # a file is never both valid and invalid.
                    numberOfJSON += 1

    resultLogPath = "result_" + datetime.now().strftime("%Y-%m-%d_%H.%M.%S") + ".log"
    with open(resultLogPath, 'a', encoding="utf-8") as resultLogHandler:
        printWithWriteLog("✅ Job is done.\n", resultLogHandler)
        printWithWriteLog("🔴 Number of JSON: " + str(numberOfJSON), resultLogHandler)
        printWithWriteLog("🔴 Number of Invalid JSON: " + str(numberOfInvalidJSON), resultLogHandler)
        if numberOfInvalidJSON > 0:
            printWithWriteLog("🔴 Execption Log: " + exceptionLogPath, resultLogHandler)
            if not IS_DEBUGGING:
                resp = slackPushBot.postMessage(":red_circle: [" + deviceAlias + "] -- 다양성 통계(아동) 집계 중 예외 발생\nJSON "+ str(numberOfInvalidJSON) + "건 발생")
                # The exception log was closed above, so its content is complete.
                # resp["ts"] is passed as the second argument -- presumably to
                # post the log as a reply to that message; confirm in SlackPushBot.
                with open(exceptionLogPath, 'r', encoding="utf-8") as exceptionLogReader:
                    slackPushBot.postMessage("❗ 예외\n\n```" + exceptionLogReader.read() + "```", resp["ts"])
        # Plain "s" suffix: the previous "\bs" emitted a backspace control
        # character that erased the last digit on terminals.
        printWithWriteLog("🔴 Running Time:" + str(round(time.time() - startTime, 2)) + "s", resultLogHandler)

        ## Time statistics, one value per line and per language, in this fixed order.
        printWithWriteLog("\n✅ Statstics", resultLogHandler)
        for lang, languageStats in stats.items():
            printWithWriteLog("\n🆗 " + lang, resultLogHandler)
            reportRows = [
                languageStats.statsOfGender["M"],
                languageStats.statsOfGender["F"],
                languageStats.statsOfAge["G01"],
                languageStats.statsOfAge["G02"],
                languageStats.statsOfAge["G03"],
                languageStats.statsOfEnv["Silence"],
                languageStats.statsOfEnv["Noise"],
            ]
            # The "N/A" environment row is reported for English only.
            if lang == "ENG":
                reportRows.append(languageStats.statsOfEnv["N/A"])
            reportRows.append(languageStats.statsOfType["Formatted"])
            reportRows.append(languageStats.statsOfType["Free"])
            for seconds in reportRows:
                printWithWriteLog(convertSecondsToHours(seconds), resultLogHandler)

    if not IS_DEBUGGING:
        resp = slackPushBot.postMessage(":black_square_for_stop: [" + deviceAlias + "] -- 다양성 통계(아동) 집계 종료\n" + targetPath)
        with open(resultLogPath, 'r', encoding="utf-8") as resultLogReader:
            slackPushBot.postMessage("🆗 결과\n\n```" + resultLogReader.read() + "```", resp["ts"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment