Skip to content

Instantly share code, notes, and snippets.

@soeque1
Last active February 18, 2022 08:26
Show Gist options
  • Save soeque1/3c8e98eec52675bf0ea2 to your computer and use it in GitHub Desktop.
Save soeque1/3c8e98eec52675bf0ea2 to your computer and use it in GitHub Desktop.
R 형태소 분석기 테스트용(한나눔, 꼬꼬마, 트위터)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"library(rJava)\n",
".jinit()\n",
".jaddClassPath('/Library/Frameworks/R.framework/Versions/3.2/Resources/library/KoNLP/java/kkma-2.0.jar')\n",
".jaddClassPath('/Library/Frameworks/R.framework/Versions/3.2/Resources/library/KoNLP/java/quantlab.jar')\n",
"KkmaObj <- .jnew('kr.lucypark.kkma.KkmaInterface')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"kkma.extractNoun <- function(strings)\n",
"{\n",
" library(stringr)\n",
" res = .jrcall(KkmaObj, 'extractNoun', strings)\n",
" res = .jstrVal(.jcast(res, \"java/lang/String\"))\n",
" res = str_extract_all(res, \"[가-힣]+\")[[1]]\n",
" res\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
".jaddClassPath('/Library/Frameworks/R.framework/Versions/3.2/Resources/library/KoNLP/java/korean-text-3.0.jar')\n",
".jaddClassPath('/Library/Frameworks/R.framework/Versions/3.2/Resources/library/KoNLP/java/scala-library-2.11.6.jar')\n",
".jaddClassPath('/Library/Frameworks/R.framework/Versions/3.2/Resources/library/KoNLP/java/twitter-text-1.11.1.jar')\n",
"twitterObj <- .jnew('com.twitter.penguin.korean.TwitterKoreanProcessorJava$Builder')\n",
"twitterObj <- .jrcall(twitterObj, 'build')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"twi.extractNoun <- function(strings)\n",
"{\n",
" library(stringr)\n",
" res = .jrcall(twitterObj, 'tokenize', strings)\n",
" res = .jstrVal(res)\n",
" res = str_match_all(res, \"([가-힣]+)\\\\(Noun\")[[1]][,2]\n",
" res\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading required package: stringr\n",
"Loading required package: hash\n",
"hash-2.2.6 provided by Decision Patterns\n",
"\n",
"Loading required package: tau\n",
"Loading required package: Sejong\n",
"Successfully Loaded Sejong Package.\n",
"Checking user defined dictionary!\n",
"\n",
"\n",
"Attaching package: ‘KoNLP’\n",
"\n",
"The following object is masked from ‘package:tau’:\n",
"\n",
" is.ascii\n",
"\n"
]
}
],
"source": [
"library(KoNLP)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"'오늘도술로밤을세우'"
],
"text/latex": [
"'오늘도술로밤을세우'"
],
"text/markdown": [
"'오늘도술로밤을세우'"
],
"text/plain": [
"[1] \"오늘도술로밤을세우\""
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/html": [
"<ol class=list-inline>\n",
"\t<li>'오늘'</li>\n",
"\t<li>'술'</li>\n",
"\t<li>'밤'</li>\n",
"</ol>\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item '오늘'\n",
"\\item '술'\n",
"\\item '밤'\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. '오늘'\n",
"2. '술'\n",
"3. '밤'\n",
"\n",
"\n"
],
"text/plain": [
"[1] \"오늘\" \"술\" \"밤\" "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/html": [
"<ol class=list-inline>\n",
"\t<li>'오늘'</li>\n",
"\t<li>'도술'</li>\n",
"\t<li>'밤'</li>\n",
"</ol>\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item '오늘'\n",
"\\item '도술'\n",
"\\item '밤'\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. '오늘'\n",
"2. '도술'\n",
"3. '밤'\n",
"\n",
"\n"
],
"text/plain": [
"[1] \"오늘\" \"도술\" \"밤\" "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extractNoun('오늘도술로밤을세우고')\n",
"kkma.extractNoun('오늘도술로밤을세우고')\n",
"twi.extractNoun('오늘도술로밤을세우고')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<ol class=list-inline>\n",
"\t<li>'오늘'</li>\n",
"\t<li>'술'</li>\n",
"\t<li>'밤'</li>\n",
"\t<li>'세우'</li>\n",
"</ol>\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item '오늘'\n",
"\\item '술'\n",
"\\item '밤'\n",
"\\item '세우'\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. '오늘'\n",
"2. '술'\n",
"3. '밤'\n",
"4. '세우'\n",
"\n",
"\n"
],
"text/plain": [
"[1] \"오늘\" \"술\" \"밤\" \"세우\""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/html": [
"<ol class=list-inline>\n",
"\t<li>'오늘'</li>\n",
"\t<li>'술'</li>\n",
"\t<li>'밤'</li>\n",
"</ol>\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item '오늘'\n",
"\\item '술'\n",
"\\item '밤'\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. '오늘'\n",
"2. '술'\n",
"3. '밤'\n",
"\n",
"\n"
],
"text/plain": [
"[1] \"오늘\" \"술\" \"밤\" "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/html": [
"<ol class=list-inline>\n",
"\t<li>'오늘'</li>\n",
"\t<li>'술'</li>\n",
"\t<li>'밤'</li>\n",
"</ol>\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item '오늘'\n",
"\\item '술'\n",
"\\item '밤'\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. '오늘'\n",
"2. '술'\n",
"3. '밤'\n",
"\n",
"\n"
],
"text/plain": [
"[1] \"오늘\" \"술\" \"밤\" "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extractNoun('오늘도 술로 밤을 세우고')\n",
"kkma.extractNoun('오늘도 술로 밤을 세우고')\n",
"twi.extractNoun('오늘도 술로 밤을 세우고')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<ol class=list-inline>\n",
"\t<li>'public static void kr.lucypark.kkma.KkmaInterface.main(java.lang.String[]) throws java.lang.Exception'</li>\n",
"\t<li>'public org.snu.ids.ha.index.KeywordList kr.lucypark.kkma.KkmaInterface.extractNoun(java.lang.String)'</li>\n",
"\t<li>'public java.util.List kr.lucypark.kkma.KkmaInterface.morphAnalyzer(java.lang.String) throws java.lang.Exception'</li>\n",
"\t<li>'public final void java.lang.Object.wait(long,int) throws java.lang.InterruptedException'</li>\n",
"\t<li>'public final native void java.lang.Object.wait(long) throws java.lang.InterruptedException'</li>\n",
"\t<li>'public final void java.lang.Object.wait() throws java.lang.InterruptedException'</li>\n",
"\t<li>'public boolean java.lang.Object.equals(java.lang.Object)'</li>\n",
"\t<li>'public java.lang.String java.lang.Object.toString()'</li>\n",
"\t<li>'public native int java.lang.Object.hashCode()'</li>\n",
"\t<li>'public final native java.lang.Class java.lang.Object.getClass()'</li>\n",
"\t<li>'public final native void java.lang.Object.notify()'</li>\n",
"\t<li>'public final native void java.lang.Object.notifyAll()'</li>\n",
"</ol>\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item 'public static void kr.lucypark.kkma.KkmaInterface.main(java.lang.String[]) throws java.lang.Exception'\n",
"\\item 'public org.snu.ids.ha.index.KeywordList kr.lucypark.kkma.KkmaInterface.extractNoun(java.lang.String)'\n",
"\\item 'public java.util.List kr.lucypark.kkma.KkmaInterface.morphAnalyzer(java.lang.String) throws java.lang.Exception'\n",
"\\item 'public final void java.lang.Object.wait(long,int) throws java.lang.InterruptedException'\n",
"\\item 'public final native void java.lang.Object.wait(long) throws java.lang.InterruptedException'\n",
"\\item 'public final void java.lang.Object.wait() throws java.lang.InterruptedException'\n",
"\\item 'public boolean java.lang.Object.equals(java.lang.Object)'\n",
"\\item 'public java.lang.String java.lang.Object.toString()'\n",
"\\item 'public native int java.lang.Object.hashCode()'\n",
"\\item 'public final native java.lang.Class java.lang.Object.getClass()'\n",
"\\item 'public final native void java.lang.Object.notify()'\n",
"\\item 'public final native void java.lang.Object.notifyAll()'\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. 'public static void kr.lucypark.kkma.KkmaInterface.main(java.lang.String[]) throws java.lang.Exception'\n",
"2. 'public org.snu.ids.ha.index.KeywordList kr.lucypark.kkma.KkmaInterface.extractNoun(java.lang.String)'\n",
"3. 'public java.util.List kr.lucypark.kkma.KkmaInterface.morphAnalyzer(java.lang.String) throws java.lang.Exception'\n",
"4. 'public final void java.lang.Object.wait(long,int) throws java.lang.InterruptedException'\n",
"5. 'public final native void java.lang.Object.wait(long) throws java.lang.InterruptedException'\n",
"6. 'public final void java.lang.Object.wait() throws java.lang.InterruptedException'\n",
"7. 'public boolean java.lang.Object.equals(java.lang.Object)'\n",
"8. 'public java.lang.String java.lang.Object.toString()'\n",
"9. 'public native int java.lang.Object.hashCode()'\n",
"10. 'public final native java.lang.Class java.lang.Object.getClass()'\n",
"11. 'public final native void java.lang.Object.notify()'\n",
"12. 'public final native void java.lang.Object.notifyAll()'\n",
"\n",
"\n"
],
"text/plain": [
" [1] \"public static void kr.lucypark.kkma.KkmaInterface.main(java.lang.String[]) throws java.lang.Exception\" \n",
" [2] \"public org.snu.ids.ha.index.KeywordList kr.lucypark.kkma.KkmaInterface.extractNoun(java.lang.String)\" \n",
" [3] \"public java.util.List kr.lucypark.kkma.KkmaInterface.morphAnalyzer(java.lang.String) throws java.lang.Exception\"\n",
" [4] \"public final void java.lang.Object.wait(long,int) throws java.lang.InterruptedException\" \n",
" [5] \"public final native void java.lang.Object.wait(long) throws java.lang.InterruptedException\" \n",
" [6] \"public final void java.lang.Object.wait() throws java.lang.InterruptedException\" \n",
" [7] \"public boolean java.lang.Object.equals(java.lang.Object)\" \n",
" [8] \"public java.lang.String java.lang.Object.toString()\" \n",
" [9] \"public native int java.lang.Object.hashCode()\" \n",
"[10] \"public final native java.lang.Class java.lang.Object.getClass()\" \n",
"[11] \"public final native void java.lang.Object.notify()\" \n",
"[12] \"public final native void java.lang.Object.notifyAll()\" "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".jmethods(KkmaObj)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"res = .jrcall(KkmaObj, 'morphAnalyzer', '오늘도술로밤을세우고')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"'[[오늘도\t\t\t=> [0/오늘/NNG+2/도/JX], 술로\t\t\t=> [3/술/NNG+4/로/JKM], 밤을\t\t\t=> [5/밤/NNG+6/을/JKO], 세우고\t\t\t=> [7/세우/VV+9/고/ECE]]]'"
],
"text/latex": [
"'[[오늘도\t\t\t=> [0/오늘/NNG+2/도/JX], 술로\t\t\t=> [3/술/NNG+4/로/JKM], 밤을\t\t\t=> [5/밤/NNG+6/을/JKO], 세우고\t\t\t=> [7/세우/VV+9/고/ECE]]]'"
],
"text/markdown": [
"'[[오늘도\t\t\t=> [0/오늘/NNG+2/도/JX], 술로\t\t\t=> [3/술/NNG+4/로/JKM], 밤을\t\t\t=> [5/밤/NNG+6/을/JKO], 세우고\t\t\t=> [7/세우/VV+9/고/ECE]]]'"
],
"text/plain": [
"[1] \"[[오늘도\\t\\t\\t=> [0/오늘/NNG+2/도/JX], 술로\\t\\t\\t=> [3/술/NNG+4/로/JKM], 밤을\\t\\t\\t=> [5/밤/NNG+6/을/JKO], 세우고\\t\\t\\t=> [7/세우/VV+9/고/ECE]]]\""
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".jstrVal(res)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<ol class=list-inline>\n",
"\t<li>'public java.lang.CharSequence com.twitter.penguin.korean.TwitterKoreanProcessorJava.normalize(java.lang.CharSequence)'</li>\n",
"\t<li>'public java.util.List com.twitter.penguin.korean.TwitterKoreanProcessorJava.tokenize(java.lang.CharSequence)'</li>\n",
"\t<li>'public java.lang.CharSequence com.twitter.penguin.korean.TwitterKoreanProcessorJava.stem(java.lang.CharSequence)'</li>\n",
"\t<li>'public java.util.List com.twitter.penguin.korean.TwitterKoreanProcessorJava.tokenizeToStrings(java.lang.CharSequence)'</li>\n",
"\t<li>'public java.util.List com.twitter.penguin.korean.TwitterKoreanProcessorJava.extractPhrases(java.lang.CharSequence)'</li>\n",
"\t<li>'public final void java.lang.Object.wait(long,int) throws java.lang.InterruptedException'</li>\n",
"\t<li>'public final native void java.lang.Object.wait(long) throws java.lang.InterruptedException'</li>\n",
"\t<li>'public final void java.lang.Object.wait() throws java.lang.InterruptedException'</li>\n",
"\t<li>'public boolean java.lang.Object.equals(java.lang.Object)'</li>\n",
"\t<li>'public java.lang.String java.lang.Object.toString()'</li>\n",
"\t<li>'public native int java.lang.Object.hashCode()'</li>\n",
"\t<li>'public final native java.lang.Class java.lang.Object.getClass()'</li>\n",
"\t<li>'public final native void java.lang.Object.notify()'</li>\n",
"\t<li>'public final native void java.lang.Object.notifyAll()'</li>\n",
"</ol>\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item 'public java.lang.CharSequence com.twitter.penguin.korean.TwitterKoreanProcessorJava.normalize(java.lang.CharSequence)'\n",
"\\item 'public java.util.List com.twitter.penguin.korean.TwitterKoreanProcessorJava.tokenize(java.lang.CharSequence)'\n",
"\\item 'public java.lang.CharSequence com.twitter.penguin.korean.TwitterKoreanProcessorJava.stem(java.lang.CharSequence)'\n",
"\\item 'public java.util.List com.twitter.penguin.korean.TwitterKoreanProcessorJava.tokenizeToStrings(java.lang.CharSequence)'\n",
"\\item 'public java.util.List com.twitter.penguin.korean.TwitterKoreanProcessorJava.extractPhrases(java.lang.CharSequence)'\n",
"\\item 'public final void java.lang.Object.wait(long,int) throws java.lang.InterruptedException'\n",
"\\item 'public final native void java.lang.Object.wait(long) throws java.lang.InterruptedException'\n",
"\\item 'public final void java.lang.Object.wait() throws java.lang.InterruptedException'\n",
"\\item 'public boolean java.lang.Object.equals(java.lang.Object)'\n",
"\\item 'public java.lang.String java.lang.Object.toString()'\n",
"\\item 'public native int java.lang.Object.hashCode()'\n",
"\\item 'public final native java.lang.Class java.lang.Object.getClass()'\n",
"\\item 'public final native void java.lang.Object.notify()'\n",
"\\item 'public final native void java.lang.Object.notifyAll()'\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. 'public java.lang.CharSequence com.twitter.penguin.korean.TwitterKoreanProcessorJava.normalize(java.lang.CharSequence)'\n",
"2. 'public java.util.List com.twitter.penguin.korean.TwitterKoreanProcessorJava.tokenize(java.lang.CharSequence)'\n",
"3. 'public java.lang.CharSequence com.twitter.penguin.korean.TwitterKoreanProcessorJava.stem(java.lang.CharSequence)'\n",
"4. 'public java.util.List com.twitter.penguin.korean.TwitterKoreanProcessorJava.tokenizeToStrings(java.lang.CharSequence)'\n",
"5. 'public java.util.List com.twitter.penguin.korean.TwitterKoreanProcessorJava.extractPhrases(java.lang.CharSequence)'\n",
"6. 'public final void java.lang.Object.wait(long,int) throws java.lang.InterruptedException'\n",
"7. 'public final native void java.lang.Object.wait(long) throws java.lang.InterruptedException'\n",
"8. 'public final void java.lang.Object.wait() throws java.lang.InterruptedException'\n",
"9. 'public boolean java.lang.Object.equals(java.lang.Object)'\n",
"10. 'public java.lang.String java.lang.Object.toString()'\n",
"11. 'public native int java.lang.Object.hashCode()'\n",
"12. 'public final native java.lang.Class java.lang.Object.getClass()'\n",
"13. 'public final native void java.lang.Object.notify()'\n",
"14. 'public final native void java.lang.Object.notifyAll()'\n",
"\n",
"\n"
],
"text/plain": [
" [1] \"public java.lang.CharSequence com.twitter.penguin.korean.TwitterKoreanProcessorJava.normalize(java.lang.CharSequence)\"\n",
" [2] \"public java.util.List com.twitter.penguin.korean.TwitterKoreanProcessorJava.tokenize(java.lang.CharSequence)\" \n",
" [3] \"public java.lang.CharSequence com.twitter.penguin.korean.TwitterKoreanProcessorJava.stem(java.lang.CharSequence)\" \n",
" [4] \"public java.util.List com.twitter.penguin.korean.TwitterKoreanProcessorJava.tokenizeToStrings(java.lang.CharSequence)\"\n",
" [5] \"public java.util.List com.twitter.penguin.korean.TwitterKoreanProcessorJava.extractPhrases(java.lang.CharSequence)\" \n",
" [6] \"public final void java.lang.Object.wait(long,int) throws java.lang.InterruptedException\" \n",
" [7] \"public final native void java.lang.Object.wait(long) throws java.lang.InterruptedException\" \n",
" [8] \"public final void java.lang.Object.wait() throws java.lang.InterruptedException\" \n",
" [9] \"public boolean java.lang.Object.equals(java.lang.Object)\" \n",
"[10] \"public java.lang.String java.lang.Object.toString()\" \n",
"[11] \"public native int java.lang.Object.hashCode()\" \n",
"[12] \"public final native java.lang.Class java.lang.Object.getClass()\" \n",
"[13] \"public final native void java.lang.Object.notify()\" \n",
"[14] \"public final native void java.lang.Object.notifyAll()\" "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".jmethods(twitterObj)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"res = .jrcall(twitterObj, 'tokenize', '오늘도술로밤을세우고')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"'[오늘(Noun: 0, 2), 도술(Noun: 2, 2), 로(Josa: 4, 1), 밤(Noun: 5, 1), 을(Josa: 6, 1), 세우다(Verb: 7, 3)]'"
],
"text/latex": [
"'[오늘(Noun: 0, 2), 도술(Noun: 2, 2), 로(Josa: 4, 1), 밤(Noun: 5, 1), 을(Josa: 6, 1), 세우다(Verb: 7, 3)]'"
],
"text/markdown": [
"'[오늘(Noun: 0, 2), 도술(Noun: 2, 2), 로(Josa: 4, 1), 밤(Noun: 5, 1), 을(Josa: 6, 1), 세우다(Verb: 7, 3)]'"
],
"text/plain": [
"[1] \"[오늘(Noun: 0, 2), 도술(Noun: 2, 2), 로(Josa: 4, 1), 밤(Noun: 5, 1), 을(Josa: 6, 1), 세우다(Verb: 7, 3)]\""
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".jstrVal(res)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.2.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@Park-youngjun
Copy link

혹시 R에서 kkma 사용을 어떻게 하신 건지 여쭤봐도 될까요?

@soeque1
Copy link
Author

soeque1 commented Feb 18, 2022

@Park-youngjun https://github.com/soeque1/KoNLPQ

참고해 보세요! 예전 버전이라 잘 될지는 모르겠네요.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment