OpenAI vs SBERT embeddings on similar vs dissimilar texts¶

This notebook is a simple comparison of OpenAI vs SBERT embeddings. I'm using a small subset of "The Pile", which is a diverse open-source language modeling dataset.

The expectation¶

  1. We expect chunks drawn from the same document to have high embedding similarity, while chunks from different documents should have low similarity. We test this by chunking up documents from different slices of The Pile and comparing their self-similarity vs. cross-similarity.
  2. If we collect embeddings for a diverse set of texts, we intuitively expect them to make use of most of the k-dimensional space. Truly testing this is hard due to the curse of dimensionality, but we can at least look at each dimension in isolation and check whether there is reasonable variance within that dimension across our set (both checks are sketched in the snippet right after this list).
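To make the two checks concrete, here is a tiny sketch with toy vectors (not real embeddings; the numbers are made up purely for illustration):

import numpy as np

def cos_sim(a, b):
    # cosine similarity: dot product of the two unit-normalized vectors
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

# check 1: chunks of the same document should score higher than chunks of different documents
same_doc_sim = cos_sim([0.9, 0.1, 0.0], [0.8, 0.2, 0.1])   # expect a high value
cross_doc_sim = cos_sim([0.9, 0.1, 0.0], [0.0, 0.1, 0.9])  # expect a low value

# check 2: per-dimension spread across a batch of embeddings
batch = np.random.randn(100, 8)    # stand-in for a (num_texts, num_dims) embedding matrix
per_dim_std = batch.std(axis=0)    # one std. dev. per dimension; near-zero means a "dead" dimension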

The findings¶

  1. OpenAI embeddings yield very high similarity (in the range 0.65 - 0.7) even for very dissimilar texts, while SBERT gives similarities that are close to 0 for such examples.
  2. Compared to SBERT's embeddings, the variance of each dimension in OpenAI's embeddings is much smaller, meaning that no matter what the input text is, most dimensions take values in a narrow range.

Speculation¶

I speculate that this behavior of OpenAI's embeddings is why Andrej Karpathy felt the need to use an SVM for nearest-neighbor lookups instead of simple cosine similarity. I suspect that if one were to use embeddings that fill out the space better, like SBERT's, the improvement from using an SVM (which is costly) for nearest-neighbor lookups would be small.
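For context, my understanding of that SVM trick is: treat the query embedding as the lone positive example, treat every corpus embedding as a negative, fit a linear SVM, and rank the corpus by its decision function. A minimal sketch, assuming scikit-learn is available and the embeddings are NumPy arrays (the function name and hyperparameters here are illustrative, not taken from this notebook):

import numpy as np
from sklearn import svm

def svm_rank(query_emb, corpus_embs, top_k=5):
    # the query is the single positive (label 1); every corpus vector is a negative (label 0)
    x = np.concatenate([query_emb[None, :], corpus_embs])
    y = np.zeros(len(x))
    y[0] = 1
    clf = svm.LinearSVC(class_weight="balanced", C=0.1, max_iter=10000, tol=1e-6)
    clf.fit(x, y)
    # rank corpus rows by how far they sit on the positive side of the separating hyperplane
    scores = clf.decision_function(x)[1:]
    return np.argsort(-scores)[:top_k]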

Imports¶

In [1]:
from datasets import load_dataset
from itertools import islice
import numpy as np
import pandas as pd
import random
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity
import os
from sentence_transformers import SentenceTransformer, util
import seaborn as sns
import matplotlib.pyplot as plt

Load subset of Pile dataset¶

In [2]:
p10k = load_dataset("NeelNanda/pile-10k", split="train")
Using custom data configuration NeelNanda--pile-10k-72f566e9f7c464ab
Found cached dataset parquet (/Users/venu/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
In [3]:
p10k[0, 1].keys()
Out[3]:
dict_keys(['text', 'meta'])
In [4]:
p10k[0, 0]["meta"][0]
Out[4]:
{'pile_set_name': 'Pile-CC'}
In [5]:
p10k[80, 0]["meta"][0]['pile_set_name']
Out[5]:
'StackExchange'
In [6]:
# get pile_set_name for each example, and count frequencies of each pile_set_name
names = [p10k[i, 0]["meta"][0]['pile_set_name'] for i in range(len(p10k))]
from collections import Counter
Counter(names)
Out[6]:
Counter({'Pile-CC': 2524,
         'Github': 855,
         'OpenWebText2': 1520,
         'StackExchange': 1399,
         'Wikipedia (en)': 779,
         'PubMed Abstracts': 1423,
         'USPTO Backgrounds': 514,
         'FreeLaw': 241,
         'PubMed Central': 259,
         'Enron Emails': 47,
         'HackerNews': 81,
         'NIH ExPorter': 104,
         'Books3': 9,
         'ArXiv': 91,
         'DM Mathematics': 99,
         'OpenSubtitles': 27,
         'BookCorpus2': 2,
         'Ubuntu IRC': 2,
         'YoutubeSubtitles': 11,
         'EuroParl': 6,
         'PhilPapers': 5,
         'Gutenberg (PG-19)': 2})
In [7]:
# make a dictionary with pile_set_name as key, and pick the first example
# from each pile_set_name that's longer than num_chunks * chunk_size characters
samples = {}
num_chunks = 10
chunk_size = 1000
for i in range(len(p10k)):
    # p10k[i, 0] returns rows i and 0 as a dict of column -> list; [0] then picks row i's value
    name = p10k[i, 0]["meta"][0]['pile_set_name']
    text = p10k[i, 0]["text"][0]
    if name not in samples and len(text) > chunk_size * num_chunks:
        # chunk up the text into num_chunks chunks of chunk_size characters each
        chunks = [text[j:j+chunk_size] for j in range(0, chunk_size*num_chunks, chunk_size)]
        samples[name] = chunks
In [8]:
samples.keys()
Out[8]:
dict_keys(['Pile-CC', 'USPTO Backgrounds', 'FreeLaw', 'PubMed Central', 'StackExchange', 'Books3', 'OpenWebText2', 'ArXiv', 'Github', 'OpenSubtitles', 'Wikipedia (en)', 'BookCorpus2', 'HackerNews', 'Ubuntu IRC', 'YoutubeSubtitles', 'PhilPapers', 'Gutenberg (PG-19)', 'EuroParl'])
In [9]:
len(samples.items())
for k, v in islice(samples.items(), 1):
    print(k, len(v), v[0])
Pile-CC 10 It is done, and submitted. You can play “Survival of the Tastiest” on Android, and on the web. Playing on the web works, but you have to simulate multi-touch for table moving and that can be a bit confusing.

There’s a lot I’d like to talk about. I’ll go through every topic, insted of making the typical what went right/wrong list.

Concept

Working over the theme was probably one of the hardest tasks I had to face.

Originally, I had an idea of what kind of game I wanted to develop, gameplay wise – something with lots of enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident I could fit any theme around it.

In the end, the problem with a theme like “Evolution” in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game?


In [10]:
# Common function for computing a similarity matrix and visualizing it as a heatmap
def compute_similarities_and_plot(embs: dict, name: str, num_classes=6):
    # note: relies on the module-level `samples` dict for the set of keys
    selected_samples = dict(islice(samples.items(), num_classes))
    sims_df = pd.DataFrame(columns=selected_samples.keys(), index=selected_samples.keys())
    random.seed(42)
    for k1, v1 in selected_samples.items():
        for k2, v2 in selected_samples.items():
            # average the cosine similarity over 10 randomly chosen chunk pairs
            total = 0
            num_pairs = 10
            for i in range(num_pairs):
                rnd1 = random.randint(0, len(v1)-1)
                rnd2 = random.randint(0, len(v2)-1)
                total += cosine_similarity(embs[k1][rnd1], embs[k2][rnd2])
            sims_df.loc[k1, k2] = float(total / num_pairs)
    # the DataFrame was created without a dtype, so its cells are inferred as object;
    # convert to numeric so seaborn can plot it
    sims_df = sims_df.apply(pd.to_numeric, errors='coerce')

    plt.figure(figsize=(6, 4))  # increase figure size for better visibility
    sns.heatmap(sims_df, annot=True, cmap='YlGnBu', vmin=0, vmax=1)
    plt.title(f'Avg. cosine similarities using {name} embeddings')

    # average and std. dev of the diagonal entries (self-similarity)
    text1 = "Average self-similarity {:.2f}, std.dev {:.2f}".format(np.mean(np.diag(sims_df)), np.std(np.diag(sims_df)))

    # average and std. dev of the off-diagonal entries (cross-similarity)
    non_diagonal_ix = np.where(~np.eye(sims_df.values.shape[0], dtype=bool))
    non_diagonal_mean = np.mean(sims_df.values[non_diagonal_ix])
    non_diagonal_std = np.std(sims_df.values[non_diagonal_ix])
    text2 = "Average non-self-similarity {:.2f}, std.dev {:.2f}".format(non_diagonal_mean, non_diagonal_std)

    plt.text(0, -1, text1 + "\n" + text2, fontsize=12)

    plt.show()

    return sims_df

Get OpenAI embeddings of each of the chunks¶

In [11]:
openai.api_key = os.getenv("OPENAI_API_KEY")
embedding_model = "text-embedding-ada-002"
In [12]:
cosine_similarity(get_embedding(samples["Pile-CC"][0], engine=embedding_model), get_embedding(samples["Pile-CC"][1], engine=embedding_model))
Out[12]:
0.8459940907245606
In [13]:
len(get_embedding(samples["Pile-CC"][0], engine=embedding_model))
Out[13]:
1536
In [14]:
cosine_similarity(get_embedding(samples["Pile-CC"][0], engine=embedding_model), get_embedding(samples["FreeLaw"][0], engine=embedding_model))
Out[14]:
0.6870063565524391
In [15]:
openai_embs = {}
for k1, v1 in samples.items():
    openai_embs[k1] = [get_embedding(v1[i], engine=embedding_model) for i in range(len(v1))]
In [16]:
sims_df = compute_similarities_and_plot(openai_embs, "OpenAI")

Commentary on the results¶

No matter how different the two inputs, the cosine similarity of the two resulting embeddings is 0.66 or greater!
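A hedged follow-up I didn't run above: if the high floor comes from every embedding sharing a large common component, then each embedding should be strongly aligned with the dataset's mean embedding. A quick way to check, assuming the openai_embs dict built above:

# hypothetical check: how aligned is each OpenAI embedding with the mean embedding?
all_vecs = np.array([e for chunks in openai_embs.values() for e in chunks])
mean_dir = all_vecs.mean(axis=0)
mean_dir /= np.linalg.norm(mean_dir)
alignment = (all_vecs @ mean_dir) / np.linalg.norm(all_vecs, axis=1)
# if the average alignment is high (say ~0.8), the shared component alone would account for most of the ~0.66 floor
print(alignment.mean(), alignment.std())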

Get SBERT embeddings of each of the chunks¶

In [17]:
model = SentenceTransformer('all-MiniLM-L6-v2')
In [18]:
util.cos_sim(model.encode(samples["Pile-CC"][0]), model.encode(samples["Pile-CC"][1]))
Out[18]:
tensor([[0.6097]])
In [19]:
# Double check that openai's implementation of cosine similarity returns the same results
cosine_similarity(model.encode(samples["Pile-CC"][0]), model.encode(samples["Pile-CC"][1]))
Out[19]:
0.60973257
In [20]:
util.cos_sim(model.encode(samples["Pile-CC"][0]), model.encode(samples["FreeLaw"][0]))
Out[20]:
tensor([[0.0098]])
In [21]:
sbert_embs = {}
for k1, v1 in samples.items():
    sbert_embs[k1] = [model.encode(v1[i]) for i in range(len(v1))]
In [22]:
sbert_sims = compute_similarities_and_plot(sbert_embs, "SBERT")

Commentary on the results¶

There's a much greater separation between the diagonal and the off-diagonal entries here, which makes much more sense and matches my intuition.

Compare the dimensions of OpenAI's vs SBERT's embeddings¶

One hypothesis is that OpenAI's embeddings contain some near-useless dimensions, i.e. dimensions with very little variation across different texts.

In [23]:
len(openai_embs["ArXiv"][0])
Out[23]:
1536
In [24]:
len(sbert_embs["ArXiv"][0])
Out[24]:
384
In [25]:
# allocate a matrix that will hold one OpenAI embedding per row (filled in below)
openai_embs_matrix = np.zeros((len(openai_embs)*len(openai_embs["ArXiv"]), len(openai_embs["ArXiv"][0])))
In [26]:
sbert_embs_matrix = np.zeros((len(sbert_embs)*len(sbert_embs["ArXiv"]), len(sbert_embs["ArXiv"][0])))
In [27]:
for i, (k, v) in enumerate(openai_embs.items()):
    for j in range(len(v)):
        openai_embs_matrix[i*len(v)+j] = openai_embs[k][j]
        sbert_embs_matrix[i*len(v)+j] = sbert_embs[k][j]
In [28]:
# make sure we built the matrices correctly
print(np.sum(openai_embs_matrix[0] - openai_embs["Pile-CC"][0]))
print(np.sum(sbert_embs_matrix[0] - sbert_embs["Pile-CC"][0]))
0.0
0.0
In [29]:
# make sure we built the matrices correctly
print(np.count_nonzero(openai_embs_matrix) - np.shape(openai_embs_matrix)[0] * np.shape(openai_embs_matrix)[1])
print(np.count_nonzero(sbert_embs_matrix) - np.shape(sbert_embs_matrix)[0] * np.shape(sbert_embs_matrix)[1])
0
0
In [30]:
# std. dev. along each column (i.e. per embedding dimension) of each matrix
openai_std = np.std(openai_embs_matrix, axis=0)
sbert_std = np.std(sbert_embs_matrix, axis=0)
# let's plot the distribution of std. dev in the same plot
plt.figure(figsize=(6, 4))  # increase figure size for better visibility
sns.histplot(sbert_std, kde=True, label='SBERT')
sns.histplot(openai_std, kde=True, label='OpenAI')
plt.legend()
plt.title('The distribution of std. dev. within each dimension of OpenAI vs SBERT embeddings')
plt.show()

Commentary on the results¶

It looks like most of the dimensions in OpenAI's embeddings span a very narrow range, compared to SBERT's.
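To put a rough number on "narrow", here's a small follow-up sketch (the 0.02 cutoff is arbitrary, and the two models' embeddings are on different scales, so treat this as a coarse comparison at best). It assumes the openai_std and sbert_std arrays computed above:

# hypothetical follow-up: what fraction of dimensions have a std. dev. below an arbitrary cutoff?
cutoff = 0.02
print("OpenAI fraction of dims below cutoff:", np.mean(openai_std < cutoff))
print("SBERT fraction of dims below cutoff: ", np.mean(sbert_std < cutoff))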

Try second way of getting OpenAI embeddings¶

Apparently, there's a slightly different way of getting OpenAI's embeddings (even for the same model), and the two methods don't return exactly the same result! The two methods are "openai.Embedding.create" vs "openai.embeddings_utils.get_embedding". The latter is what I see used in the OpenAI Cookbook on GitHub, but let's try the former as well to see whether the results differ. (As far as I can tell, "get_embedding" is a thin wrapper around "openai.Embedding.create" that replaces newlines with spaces before calling the API, which would explain small discrepancies.)
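As a quick sanity check on how close the two call paths are for a single chunk (a sketch, not something run above; I'd expect a value close to, but perhaps not exactly, 1.0):

# compare the two call paths on one chunk; any gap likely comes from get_embedding's newline handling
text = samples["Pile-CC"][0]
e1 = get_embedding(text, engine=embedding_model)
e2 = openai.Embedding.create(model=embedding_model, input=text)["data"][0]["embedding"]
print(cosine_similarity(e1, e2))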

In [31]:
oa2_embs = {}
for k1, v1 in samples.items():
    oa2_embs[k1] = [openai.Embedding.create(model=embedding_model,input=v1[i],)["data"][0]["embedding"] for i in range(len(v1))]
In [32]:
_ = compute_similarities_and_plot(oa2_embs, "OpenAI2")

Commentary on the results¶

The resulting plot looks very similar, although not identical. Whatever the exact cause of the small differences between the two call paths, the embeddings appear to share the same overall characteristics.
