import pandas as pd
import numpy as np
import polars as pl
from os import listdir
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer, models
from datasets import load_dataset
from sentence_transformers import InputExample
from torch.utils.data import DataLoader
from sentence_transformers import losses
# Apply seaborn's default plot styling.
sns.set()
# %matplotlib inline  — Jupyter magic from the original notebook; it is not
# valid Python syntax in a plain .py file, so it is kept only as a comment.
Transformer embeddings for clinical NLP
Date: 10/05/24
Objective:
- Develop a set of embeddings that is suitable for clinical record linkage, for example identifying which pathology test result names are the same, including the use of abbreviations
References:
Clinical abbreviation datasets - https://www.nature.com/articles/s41597-021-00929-4 - https://github.com/lisavirginia/clinical-abbreviations
Fine-tuning transformers (HuggingFace) - https://huggingface.co/blog/how-to-train-sentence-transformers
Data sources: Load and clean
Using the clinical abbreviations datasets mentioned in the references
# Folder holding the clinical-abbreviation source files (see References).
source_folder = '/Users/alexlee/Desktop/Data/clinical/clinical_abbreviations/'
filenames = listdir(source_folder)
def _load_pairs(path, **read_csv_kwargs):
    """Read one abbreviation/sense file and strip whitespace from both columns.

    Parameters
    ----------
    path : str
        Path of the delimited file to read.
    **read_csv_kwargs
        Extra keyword arguments forwarded to ``pd.read_csv``
        (``sep``, ``header``, ``names``, ...).

    Returns
    -------
    pd.DataFrame
        File contents with leading/trailing whitespace removed from the
        ``abbreviation`` and ``sense`` columns.
    """
    return (
        pd.read_csv(path, **read_csv_kwargs)
        .assign(abbreviation=lambda df_: df_.abbreviation.str.strip(),
                sense=lambda df_: df_.sense.str.strip())
    )


# '=' separated abbreviation/sense pairs, no header row.
df1 = _load_pairs(f'{source_folder}/{filenames[0]}',
                  sep='=', header=None, names=['abbreviation', 'sense'])
# Tab separated pairs with an extra similarity-score column, no header row.
df2 = _load_pairs(f'{source_folder}/{filenames[1]}',
                  sep='\t', header=None,
                  names=['abbreviation', 'sense', 'similarity'])
# Comma separated pairs.
df3 = _load_pairs(f'{source_folder}/{filenames[2]}',
                  sep=',', names=['abbreviation', 'sense'])
# Vanderbilt files are tab separated and ship with their own header row.
df4 = _load_pairs(f'{source_folder}/vanderbilt_clinic_notes.txt', sep='\t')
df5 = _load_pairs(f'{source_folder}/vanderbilt_discharge_sums.txt', sep='\t')
= pd.concat([df1, df2, df3, df4, df4]) df_all
# Keep one clean (abbreviation, sense) pair per row: restrict to the two
# text columns, drop duplicates and rows missing either value, then sort
# alphabetically for readability.
df_all = (
    df_all
    .loc[:, ['abbreviation', 'sense']]
    .drop_duplicates()
    .sort_values(by=['abbreviation'])
    # dropna replaces the two .query('... .isnull() == False') calls.
    .dropna(subset=['abbreviation', 'sense'])
    # drop=True replaces the reset_index() + .iloc[:, 1:] index-column trick.
    .reset_index(drop=True)
)

# training data: ndarray of [abbreviation, sense] string pairs
train_data = df_all.values
Load model
## Step 1: use an existing language model
= models.Transformer('distilroberta-base') word_embedding_model
/Users/alexlee/Desktop/Coding/transformers/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
## Step 2: use a pool function over the token embeddings
# Step 2: pool the per-token embeddings into one fixed-size sentence vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
## Join steps 1 and 2 using the modules argument
= SentenceTransformer(modules=[word_embedding_model, pooling_model]) model
Create dataloader object
# Wrap each (abbreviation, sense) row as an InputExample pair.
# The original index loop (for i in range(len(train_data)) + append) is
# replaced by an equivalent list comprehension over the rows themselves.
train_examples = [InputExample(texts=[row[0], row[1]]) for row in train_data]

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
Loss function
Use MultipleNegativesRankingLoss since our training data consists of pairs of similar strings
= losses.MultipleNegativesRankingLoss(model=model) train_loss
Fine-tuning
# Fine-tune on the abbreviation/sense pairs, then persist the model.
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
model.save('clinical_embeddings_100524')
Evaluation
# initially inspecting some of the matches
# A small probe set of pathology/clinical terms, their abbreviations and
# punctuation variants, used to eyeball the learned similarities.
texts = ['sob',
         'shortness of breath',
         'hbg',
         'plt',
         'bilirubin',
         'haemoglobin',
         'platelets',
         'alp',
         'alkaline phosphatase',
         'hb', 'hb.', 'plt.', 'plat', 's.o.b', 'sob on arrival']

texts_emb = model.encode(texts)
# All-pairs dot-product similarity. The original nested Python loop over
# (n, m) index pairs is replaced by a single matrix product; row-major
# ravel() reproduces the original (text1-outer, text2-inner) pair order.
sim_matrix = texts_emb @ texts_emb.T

df = pd.DataFrame(data={
    'text1': [a for a in texts for _ in texts],   # outer loop: repeat each
    'text2': [b for _ in texts for b in texts],   # inner loop: cycle all
    'similarity': sim_matrix.ravel(),
})
# Inspect the most similar probe strings for one query term.
query = 'sob on arrival'
df.query(f'text1 == "{query}"').sort_values(by='similarity', ascending=False)
text1 | text2 | similarity | |
---|---|---|---|
224 | sob on arrival | sob on arrival | 266.899292 |
210 | sob on arrival | sob | 183.777115 |
211 | sob on arrival | shortness of breath | 130.903046 |
223 | sob on arrival | s.o.b | 107.093262 |
215 | sob on arrival | haemoglobin | 45.915550 |
214 | sob on arrival | bilirubin | 27.788834 |
218 | sob on arrival | alkaline phosphatase | 15.709982 |
219 | sob on arrival | hb | 14.771133 |
220 | sob on arrival | hb. | 8.055744 |
217 | sob on arrival | alp | -5.120555 |
212 | sob on arrival | hbg | -10.904447 |
222 | sob on arrival | plat | -18.356281 |
216 | sob on arrival | platelets | -39.142982 |
221 | sob on arrival | plt. | -47.600090 |
213 | sob on arrival | plt | -53.838760 |