## 게시판 스크래핑
### 페이지 바꾸기
import requests
국민대 홈페이지 게시판 URL. `pn=` 부분이 페이지를 나타낸다. `{}`로 페이지 번호가 들어갈 자리를 표시한다.
url = '{}'
페이지를 0번부터 9번까지 바꿔가며 출력한다
"for page in range(10):\n",
!pip install lxml
!pip install cssselect
import lxml.html
### 게시물 URL 가져오기
0번 페이지 가져오기
# Fetch page 0 of the board by substituting 0 into the `url` template.
res = requests.get(url.format(0))
# Parse the response body into an lxml HTML tree for CSS-selector queries.
root = lxml.html.fromstring(res.text)
from urllib.parse import urljoin
"for link in root.cssselect('.boardlist a'): # class=\"boardlist\" 아래에 있는 a 링크를 모두 모아서\n",
### 게시물 내용 가져오기
# NOTE(review): the URL argument is an empty string here — presumably the
# post URL was stripped from this copy; supply a real URL before running.
res = requests.get('')
# Force UTF-8 decoding for the text read from res.text below.
res.encoding = 'utf8'
root = lxml.html.fromstring(res.text)
# Select the elements whose id is "view-detail-data"; cssselect returns a
# list of matching elements.
content = root.cssselect('#view-detail-data')
'국민대학교 창업보육센터 계약직원 모집...'
### 종합
"article_urls = []\n",
수집한 각 주소에서 게시물 본문을 가져온다.
"contents = []\n",
## 11.2 Word Embedding
(교재와 동일)
"import requests\n",
" 'GOLDEN',\n",
!pip install gensim
from gensim.models.word2vec import Word2Vec
"model = Word2Vec(data, # 리스트 형태의 데이터\n",
model.wv.similarity('princess', 'queen')
/usr/local/lib/python3.6/dist-packages/gensim/ FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
 if np.issubdtype(vec.dtype,
0.9875084757804871
/usr/local/lib/python3.6/dist-packages/gensim/ FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
 if np.issubdtype(vec.dtype,
"[('fox', 0.9914872646331787),\n",
model.wv.most_similar(positive=['man', 'princess'], negative=['woman'])
/usr/local/lib/python3.6/dist-packages/gensim/ FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
 if np.issubdtype(vec.dtype,
"[('cat', 0.9745951890945435),\n",
"from keras.layers import Embedding"
Using TensorFlow backend.
# Unpack the two dimensions of the trained word-vector matrix; they are used
# as input_dim / output_dim of the Embedding layer below.
NUM_WORDS, EMB_DIM = model.wv.vectors.shape
(1818, 100)
nn = Sequential()
"emb = Embedding(input_dim=NUM_WORDS, output_dim=EMB_DIM,\n",
"--2018-11-09 04:50:09--\n",
" inflating: ko.bin \n",
# Load a saved Word2Vec model from the local file ko.bin.
kovec = Word2Vec.load('ko.bin')
/usr/local/lib/python3.6/dist-packages/gensim/ FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
 if np.issubdtype(vec.dtype,
"[('국왕', 0.6174007654190063),\n",
## 11.5 ELMo 실습
!pip install tensorflow-hub
"Requirement already satisfied: tensorflow-hub in /usr/local/lib/python3.6/dist-packages (0.1.1)\n",
import tensorflow_hub as hub
# NOTE(review): the module handle is an empty string — presumably the TF-Hub
# ELMo URL was stripped from this copy; restore it before running.
elmo = hub.Module("", trainable=True)
"INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.\n",
"import tensorflow as tf\n",
Using TensorFlow backend.
"def elmo_embedding(x):\n",
