BERT's word embeddings

BERT is one of the seminal works in natural language processing (NLP). A team of Google researchers trained BERT to encode sequences of text tokens (sub-words) into useful numerical representations. These representations can then be used for many important NLP tasks, such as question answering or natural language inference.

Models like BERT build on earlier work such as word2vec and GloVe, which are algorithms for learning numerical vectors as representations of words.

In this notebook, I look at the static word embeddings learned by BERT. That is, BERT learns a vector for every token in its vocabulary; these live in the bottom layer of the model, before the encoder layers turn them into contextualized representations. Where do these vectors sit in relation to each other? To get a sense of this, I project them down to 2, 3, and 4 dimensions with PCA and plot the results interactively.

In [ ]:
import os
import pandas as pd
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go

# Output directory for the exported interactive Plotly figures
os.makedirs('../plotly', exist_ok=True)
In [ ]:
vocab = AutoTokenizer.from_pretrained(
    "google-bert/bert-base-uncased",
).vocab
# Sort tokens by their vocabulary index, so that row i of the embedding
# matrix corresponds to words_in_order[i].
words_in_order = sorted(vocab.keys(), key=lambda x: vocab[x])

# The static (input) word embedding matrix: one row per vocabulary token.
X = AutoModelForMaskedLM.from_pretrained(
    "google-bert/bert-base-uncased",
    torch_dtype=torch.float16,
    device_map="cpu",
    attn_implementation="sdpa"
).bert.embeddings.word_embeddings.weight.detach()
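
As a quick sanity check, the embedding matrix should have one row per vocabulary entry. (The shapes here assume the standard bert-base-uncased configuration, with a 30,522-token vocabulary and 768-dimensional embeddings; a different checkpoint will give different numbers.)

In [ ]:
# Each row of X is the static embedding of the token at that vocabulary index.
# For bert-base-uncased this should print 30522 and torch.Size([30522, 768]).
print(len(words_in_order), X.shape)
assert X.shape[0] == len(words_in_order)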
In [ ]:
dims = (2, 3, 4)

pcas = dict.fromkeys(dims)

# Project the full embedding matrix down to 2, 3, and 4 principal components.
for dim in dims:
    pca = PCA(n_components=dim)
    pcas[dim] = pca.fit_transform(X)
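
How much of the embedding space do these few components actually capture? An optional check: after the loop, `pca` still refers to the 4-component fit, and its per-component variance ratios give an idea of how much the leading components explain (the exact percentages depend on the checkpoint).

In [ ]:
# Fraction of the total variance captured by each of the first four principal components,
# using the 4-component fit left over from the loop above.
print(pca.explained_variance_ratio_)
print(f'Cumulative: {pca.explained_variance_ratio_.cumsum()}')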
In [ ]:
dfs = dict.fromkeys(dims)

# Build one DataFrame per projection: the token string plus one column per
# principal component, so the word can be shown on hover in the plots below.
for dim in dims:
    d = {
        'word': words_in_order
    }
    for i in range(1, dim+1):
        d[f'pc{i}'] = pcas[dim][:, i-1].tolist()

    dfs[dim] = pd.DataFrame(d)
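
A quick look at one of the resulting tables: one row per token with its projected coordinates, which also makes it easy to look up where any individual word lands (here 'king', chosen arbitrarily).

In [ ]:
# One row per vocabulary token, with its coordinates in the 2-component projection.
print(dfs[2].head())

# Projected position of a single token, e.g. 'king'.
print(dfs[2][dfs[2].word == 'king'])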
In [ ]:
legend_dict = dict(
    orientation='h',
    y=-0.15,
)

marker_dict = dict(
    size=3,
    opacity=0.4,
)

layout = go.Layout(
    margin = go.layout.Margin(
        b=20,
        t=50,
    )
)
In [ ]:
fig = px.scatter(
    dfs[2],
    x='pc1',
    y='pc2',
    title='BERT: PCA in 2 dimensions',
    hover_data={'word': True, 'pc1': False, 'pc2': False},
    height=450,
    width=800
)
fig.update_layout(legend=legend_dict, margin=layout.margin, title_x=0.5)
fig.update_traces(marker=marker_dict)
fig.show()
fig.write_html('../plotly/bert_pca_2.html')
In [ ]:
fig = px.scatter_3d(
    dfs[3],
    x='pc1',
    y='pc2',
    z='pc3',
    title='BERT: PCA in 3 dimensions',
    hover_data={'word': True, 'pc1': False, 'pc2': False, 'pc3': False},
    height=450,
    width=800
)
fig.update_layout(legend=legend_dict, margin=layout.margin, title_x=0.5)
fig.update_traces(marker=marker_dict)
fig.show()
fig.write_html('../plotly/bert_pca_3.html')
In [ ]:
fig = px.scatter(
    dfs[3],
    x='pc1',
    y='pc2',
    color='pc3',
    title='BERT: PCA in 3 dimensions (third component as color)',
    hover_data={'word': True, 'pc1': False, 'pc2': False, 'pc3': False},
    height=450,
    width=800
)
fig.update_layout(legend=legend_dict, margin=layout.margin, title_x=0.5)
fig.update_traces(marker=marker_dict)
fig.show()
fig.write_html('../plotly/bert_pca_3_color.html')
In [ ]:
fig = px.scatter_3d(
    dfs[4],
    x='pc1',
    y='pc2',
    z='pc3',
    color='pc4',
    title='BERT: PCA in 4 dimensions (fourth component as color)',
    hover_data={'word': True, 'pc1': False, 'pc2': False, 'pc3': False, 'pc4': False},
    height=450,
    width=800
)
fig.update_layout(legend=legend_dict, margin=layout.margin, title_x=0.5)
fig.update_traces(marker=marker_dict)
fig.show()
fig.write_html('../plotly/bert_pca_4.html')

References

  • BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding (Devlin et al., NAACL 2019)
  • Efficient Estimation of Word Representations in Vector Space (Mikolov et al., ICLR 2013)
  • GloVe: Global Vectors for Word Representation (Pennington et al., EMNLP 2014)