

1.1 自然语言处理的核心任务



1.2 NLP在医疗领域的应用





2.1 自然语言处理的核心技术



2.2 NLP在医疗领域的技术挑战




3.1 语音识别




$$ y = Wx + b $$

其中,$y$ 是输出,$x$ 是输入,$W$ 是权重矩阵,$b$ 是偏置向量。

3.2 文本分类




$$ P(c|w) = \frac{\exp(\theta_c^T \phi(w))}{\sum_{c' \in C} \exp(\theta_{c'}^T \phi(w))} $$

其中,$P(c|w)$ 是文本$w$属于类别$c$的概率,$\theta_c$ 是类别$c$的参数向量,$\phi(w)$ 是文本$w$的特征向量。

3.3 命名实体识别




$$ \arg\max_c \sum_{i=1}^n \log P(w_i|c) $$

其中,$c$ 是实体类别,$w_i$ 是单词序列中的第$i$个单词,$P(w_i|c)$ 是单词$w_i$属于实体类别$c$的概率。

3.4 关系抽取




$$ P(r|e_1, e_2) = \frac{\exp(\theta_r^T \phi(e_1, e_2))}{\sum_{r' \in R} \exp(\theta_{r'}^T \phi(e_1, e_2))} $$

其中,$P(r|e_1, e_2)$ 是实体$e_1$和$e_2$之间的关系$r$的概率,$\theta_r$ 是关系$r$的参数向量,$\phi(e_1, e_2)$ 是实体$e_1$和$e_2$的特征向量。


4.1 语音识别


```python
import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


# Load the waveform; sr=None asks librosa to keep the file's native sampling
# rate instead of resampling (per the librosa.load documentation).
y, sr = librosa.load("audio.wav", sr=None)

# Compute MFCC features from the waveform at that sampling rate.
mfcc = librosa.feature.mfcc(y=y, sr=sr)


class RNN(nn.Module):
    """Single-layer RNN followed by a linear classification head.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Size of the recurrent hidden state.
        output_dim: Number of output classes (logits per sequence).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.rnn = nn.RNN(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for x of shape (batch, seq, input_dim)."""
        # Build the initial hidden state on the same device/dtype as the input
        # so the module also works after being moved to another device.
        h0 = torch.zeros(
            1, x.size(0), self.hidden_dim, dtype=x.dtype, device=x.device
        )
        out, _ = self.rnn(x, h0)
        # Classify from the output of the last time step only.
        return self.fc(out[:, -1, :])


inputdim = mfcc.shape[1] hiddendim = 128 outputdim = 26 model = RNN(inputdim, hiddendim, outputdim) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters())

for epoch in range(100): optimizer.zero_grad() output = model(mfcc) loss = criterion(output, labels) loss.backward() optimizer.step() ```

4.2 文本分类


```python
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split


data = pd.read_csv("data.csv") texts = data["text"] labels = data["label"]


tokenizer = nltk.wordtokenize texts = [tokenizer(text) for text in texts] wordtoidx = {} idxtoword = {} for text in texts: for word in text: if word not in wordtoidx: wordtoidx[word] = len(wordtoidx) idxtoword[len(idxtoword)] = word vocabsize = len(wordtoidx)


texts = np.zeros((len(texts), len(texts[0]), vocabsize), dtype=np.float32) for i, text in enumerate(texts): for j, word in enumerate(text): texts[i, j, wordto_idx[word]] = 1


inputdim = vocabsize hiddendim = 128 outputdim = 2 model = nn.LSTM(inputdim, hiddendim, batch_first=True) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters())

for epoch in range(100): optimizer.zero_grad() output = model(texts) loss = criterion(output, labels) loss.backward() optimizer.step() ```

4.3 命名实体识别


```python
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data


# train_test_split is not imported by this snippet's header.
from sklearn.model_selection import train_test_split

# NOTE(review): `traindata` is a bare Field at this point, not a dataset of
# examples, so the split and vocabulary calls below have no corpus to work on.
# A dataset must be loaded with this Field first — TODO confirm the intended
# data source.
traindata = data.Field(sequential=True, batch_first=True)
traindata, testdata = train_test_split(traindata, test_size=0.2)

# Cap each vocabulary at the 20000 most frequent tokens.
traindata.build_vocab(traindata, max_size=20000)
testdata.build_vocab(testdata, max_size=20000)


class LSTM(nn.Module):
    """Single-layer LSTM followed by a linear classification head.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Size of the LSTM hidden and cell states.
        output_dim: Number of output classes (logits per sequence).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for x of shape (batch, seq, input_dim)."""
        # Zero initial hidden/cell states, created on the input's device and
        # dtype so the module keeps working after being moved.
        h0 = torch.zeros(
            1, x.size(0), self.hidden_dim, dtype=x.dtype, device=x.device
        )
        c0 = torch.zeros_like(h0)
        out, _ = self.lstm(x, (h0, c0))
        # Classify from the output of the last time step only.
        return self.fc(out[:, -1, :])


inputdim = traindata.vocab.vectors.size(0) hiddendim = 128 outputdim = len(traindata.vocab) model = LSTM(inputdim, hiddendim, outputdim) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters())

for epoch in range(100): optimizer.zerograd() output = model(traindata) loss = criterion(output, labels) loss.backward() optimizer.step() ```

4.4 关系抽取


```python
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data


# train_test_split is not imported by this snippet's header.
from sklearn.model_selection import train_test_split

# NOTE(review): `traindata` is a bare Field at this point, not a dataset of
# examples, so the split and vocabulary calls below have no corpus to work on.
# A dataset must be loaded with this Field first — TODO confirm the intended
# data source.
traindata = data.Field(sequential=True, batch_first=True)
traindata, testdata = train_test_split(traindata, test_size=0.2)

# Cap each vocabulary at the 20000 most frequent tokens.
traindata.build_vocab(traindata, max_size=20000)
testdata.build_vocab(testdata, max_size=20000)


class BiLSTM(nn.Module):
    """Embedding + bidirectional LSTM + per-token linear head.

    Args:
        input_dim: Vocabulary size (number of embedding rows).
        hidden_dim: Embedding size and per-direction LSTM hidden size.
        output_dim: Number of output classes.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        # bidirectional=True is what makes the LSTM emit 2 * hidden_dim
        # features per step, matching the head below and the two-direction
        # initial states built in forward().
        self.lstm = nn.LSTM(
            hidden_dim, hidden_dim, batch_first=True, bidirectional=True
        )
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        """Return logits of shape (batch, seq, output_dim) for token ids x of shape (batch, seq)."""
        x = self.embedding(x)
        # No manual concatenation of a permuted copy is needed: the
        # bidirectional LSTM already reads the sequence in both directions
        # (and a 4-axis permute cannot be applied to this 3-D tensor).
        h0 = torch.zeros(
            2, x.size(0), self.hidden_dim, dtype=x.dtype, device=x.device
        )
        c0 = torch.zeros_like(h0)
        out, _ = self.lstm(x, (h0, c0))
        return self.fc(out)


inputdim = traindata.vocab.vectors.size(0) hiddendim = 128 outputdim = len(traindata.vocab) model = BiLSTM(inputdim, hiddendim, outputdim) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters())

for epoch in range(100): optimizer.zerograd() output = model(traindata) loss = criterion(output, labels) loss.backward() optimizer.step() ```














