Apply D3lta to a generated dataset¶
In [1]:
import pandas as pd
from d3lta.faissd3lta import semantic_faiss
import plotly_express as px
pd.set_option("max_colwidth", None)
Synthetic dataset¶
The dataset has been generated with the help of gpt-3.5-turbo and DeepL. Each document is a text stored in the `original` column of the dataset.

- Documents for rewording and copypasta have been generated with a specific `prompt`.
- Documents for translation do not have a prompt.
- Documents used to create the variations (rewording, copypasta, translation) are seeds (`seed` set to `True`). For simplicity, the texts have been generated so that each one can only be a specific duplicate type.
- Documents can be derived from different `text_type` sources: books, tweets, news.
- The `language` of each text is given.

These columns are checked quickly just after the preview below.
In [2]:
df_synth = pd.read_csv('../data/synthetic_dataset_documents.csv')
df_synth = df_synth.assign(doc_id=df_synth['doc_id'].astype(str)).set_index(["doc_id"])
df_synth.head(5)
Out[2]:
| doc_id | original | text_type | language | prompt | seed |
|---|---|---|---|---|---|
| 10 | Voici que j'achève, avec ce roman, les cinq ouvrages qui m'ont été envoyés dans le cadre d'une opération Masse Critique privilégiée et c'est donc tout naturellement que je commence cette critique en remerciant babelio ainsi que les éditions Kennes car je suis vraiment contente d'avoir découvert leur nouvelle collections K, même si si je reste sur une mauvaise impression avec cette dernière lecture. | books | fr | NaN | True |
| 11 | With this novel, I've completed the five books sent to me as part of a special Critical Mass campaign, so it's only natural that I should start this review by thanking babelio and Kennes, because I'm really pleased to have discovered their new K series, even if I'm still left with a bad impression from this latest read. | books | en | NaN | False |
| 12 | Mit diesem Roman schließe ich die fünf Bücher ab, die mir im Rahmen einer Aktion "Masse Critique" zugeschickt wurden, und so ist es nur natürlich, dass ich diese Rezension mit einem Dank an babelio und den Kennes-Verlag beginne, denn ich bin wirklich froh, ihre neue K-Kollektion entdeckt zu haben, auch wenn ich bei der letzten Lektüre einen schlechten Eindruck hatte. | books | de | NaN | False |
| 13 | Com este romance, completei os cinco livros que me foram enviados no âmbito de uma campanha especial da Massa Crítica, por isso é natural que comece esta recensão agradecendo ao babelio e ao Kennes, porque estou muito contente por ter descoberto a sua nova série K, mesmo que ainda tenha ficado com uma má impressão desta última leitura. | books | pt | NaN | False |
| 14 | 有了这本小说,我就完成了作为 "临界质量 "特别活动的一部分寄给我的五本书,因此,在这篇评论的开头,我自然要感谢 babelio 和 Kennes,因为我真的很高兴发现了他们的新 K 系列,尽管最近这本书给我留下了不好的印象。 | books | zh | NaN | False |
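Before plotting the length distributions, a quick check of how the documents split across the `seed`, `language` and `text_type` columns (a minimal sketch using the `df_synth` dataframe loaded above):

```python
# How many seed documents vs. generated variations, and how they spread over languages and sources
print(df_synth["seed"].value_counts())
print(df_synth["language"].value_counts())
print(df_synth["text_type"].value_counts())
```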
In [3]:
px.histogram(
    df_synth.query('seed == True').assign(len=lambda x: x['original'].str.len()),
    x="len",
    color="text_type",
    nbins=100,
    barmode="overlay",
    marginal="box",
    title="Length distributions of seeds (by source)",
    labels={"len": "length"},
)
In [4]:
px.histogram(
    df_synth.query('seed == False').assign(len=lambda x: x['original'].str.len()),
    x="len",
    color="language",
    nbins=100,
    barmode="overlay",
    marginal="box",
    title="Length distributions of non-seed documents (by language)",
    labels={"len": "length"},
)
D3lta use¶
Here we apply `semantic_faiss` to find matches between texts. `semantic_faiss` will:

- preprocess the texts and detect their language if it is not given,
- compute embeddings,
- create a faiss index,
- find the pairs of texts closer than the minimal threshold given,
- distinguish the different types of duplicated content (copypasta, rewording, translation) according to the thresholds (a rough illustration of this decision logic is sketched after this list),
- remove duplicated contents if they come from the same user (optional).

The function returns:

- `matches_synth`: the pairs of texts that are duplicated content of each other,
- `df_clusters`: the initial dataset with the clusters of duplicated content.
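As a rough illustration only (this is not d3lta's actual implementation, and the exact role of `threshold_language` inside `semantic_faiss` is not shown here), the thresholds can be thought of as a small decision rule applied to each candidate pair of texts:

```python
# Hypothetical sketch of a duplicate-type decision rule based on two similarity scores.
# The real semantic_faiss logic may differ; this only illustrates what the thresholds control.
def guess_dup_type(score_semantic, score_grapheme, same_language,
                   threshold_semantic=0.7478, threshold_grapheme=0.7):
    if score_semantic < threshold_semantic:
        return "nomatch"       # embeddings too far apart: not duplicated content
    if not same_language:
        return "translation"   # same meaning expressed in different languages
    if score_grapheme >= threshold_grapheme:
        return "copy-pasta"    # near-identical characters
    return "rewording"         # same meaning and language, different wording

print(guess_dup_type(0.92, 0.95, same_language=True))   # copy-pasta
print(guess_dup_type(0.85, 0.40, same_language=True))   # rewording
print(guess_dup_type(0.80, 0.10, same_language=False))  # translation
```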
In [5]:
%%time
matches_synth, df_clusters = semantic_faiss(
df = df_synth,
min_size_txt = 1,
df_embeddings_use = None,
embeddings_to_save = 'faiss_synth_test',
threshold_grapheme = 0.7,
threshold_language = 0.748,
threshold_semantic = 0.7478,
remove_matches_same_user = None
)
>>> Start prepare_dataset
Done. Removing 0 short texts over 2985 sentences... Done.
<<< End prepare_dataset, Took: 5.6270 sec
>>> Start compute_embeddings
INFO:tensorflow:Assets written to: use_model_kaggle/assets
<<< End compute_embeddings, Took: 46.6845 sec
>>> Start create_index_cosine
C contiguous problem solved
<<< End create_index_cosine, Took: 0.0091 sec
>>> Start find_matches
<<< End find_matches, Took: 0.5340 sec
>>> Start compute_duplicate_types
<<< End compute_duplicate_types, Took: 0.0900 sec
CPU times: user 6min 34s, sys: 47.6 s, total: 7min 21s
Wall time: 53.1 s
In [6]:
matches_synth['dup_type'].value_counts()
Out[6]:
dup_type
copy-pasta     4252
rewording      3556
translation    2708
Name: count, dtype: int64
Here, 4252 pairs of copy-pasta have been found by the algorithm.
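A few of the detected pairs can be inspected directly from `matches_synth` (a sketch; the `source`, `target`, `score` and `score_lev` columns are the ones used later in this notebook):

```python
# Show the most similar copy-pasta pairs found by semantic_faiss
(
    matches_synth
    .query("dup_type == 'copy-pasta'")
    .sort_values("score", ascending=False)
    [["source", "target", "score", "score_lev"]]
    .head()
)
```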
In [7]:
df_clusters.cluster.value_counts(dropna=False)
Out[7]:
cluster
NaN      168
0.0       10
67.0      10
54.0      10
55.0      10
        ...
299.0      2
305.0      2
306.0      2
312.0      2
303.0      2
Name: count, Length: 314, dtype: int64
All clusters should contain 10 documents. 168 documents have not been detected as duplicated content.
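A quick way to verify this is to count how many clusters reach the expected size (a minimal sketch, relying on the `cluster` column and its NaN convention shown above):

```python
# Count complete clusters (10 documents) and documents left unclustered
cluster_sizes = df_clusters["cluster"].value_counts(dropna=True)
print(f"{(cluster_sizes == 10).sum()} of {len(cluster_sizes)} clusters contain the expected 10 documents")
print(f"{df_clusters['cluster'].isna().sum()} documents were not assigned to any cluster")
```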
Verification with true label¶
`df_annotated` is a dataset of annotated pairs of texts whose `true_label` can be `translation`, `copypasta`, `rewording`, or `nomatch`.
In [8]:
df_annotated = pd.read_csv('../data/synthetic_dataset_pairs_unbalanced.csv', dtype=object)
df_annotated.head(3)
Out[8]:
| | source_target | source | target | original_source | original_target | language_source | language_target | true_label |
|---|---|---|---|---|---|---|---|---|
| 0 | 10-11 | 10 | 11 | Voici que j'achève, avec ce roman, les cinq ouvrages qui m'ont été envoyés dans le cadre d'une opération Masse Critique privilégiée et c'est donc tout naturellement que je commence cette critique en remerciant babelio ainsi que les éditions Kennes car je suis vraiment contente d'avoir découvert leur nouvelle collections K, même si si je reste sur une mauvaise impression avec cette dernière lecture. | With this novel, I've completed the five books sent to me as part of a special Critical Mass campaign, so it's only natural that I should start this review by thanking babelio and Kennes, because I'm really pleased to have discovered their new K series, even if I'm still left with a bad impression from this latest read. | fr | en | translation |
| 1 | 10-12 | 10 | 12 | Voici que j'achève, avec ce roman, les cinq ouvrages qui m'ont été envoyés dans le cadre d'une opération Masse Critique privilégiée et c'est donc tout naturellement que je commence cette critique en remerciant babelio ainsi que les éditions Kennes car je suis vraiment contente d'avoir découvert leur nouvelle collections K, même si si je reste sur une mauvaise impression avec cette dernière lecture. | Mit diesem Roman schließe ich die fünf Bücher ab, die mir im Rahmen einer Aktion "Masse Critique" zugeschickt wurden, und so ist es nur natürlich, dass ich diese Rezension mit einem Dank an babelio und den Kennes-Verlag beginne, denn ich bin wirklich froh, ihre neue K-Kollektion entdeckt zu haben, auch wenn ich bei der letzten Lektüre einen schlechten Eindruck hatte. | fr | de | translation |
| 2 | 10-13 | 10 | 13 | Voici que j'achève, avec ce roman, les cinq ouvrages qui m'ont été envoyés dans le cadre d'une opération Masse Critique privilégiée et c'est donc tout naturellement que je commence cette critique en remerciant babelio ainsi que les éditions Kennes car je suis vraiment contente d'avoir découvert leur nouvelle collections K, même si si je reste sur une mauvaise impression avec cette dernière lecture. | Com este romance, completei os cinco livros que me foram enviados no âmbito de uma campanha especial da Massa Crítica, por isso é natural que comece esta recensão agradecendo ao babelio e ao Kennes, porque estou muito contente por ter descoberto a sua nova série K, mesmo que ainda tenha ficado com uma má impressão desta última leitura. | fr | pt | translation |
In [9]:
df_annotated.true_label.value_counts()
Out[9]:
true_label
nomatch        1485000
translation       4500
copypasta         4030
rewording         4017
Name: count, dtype: int64
In [10]:
df_eval = (
df_annotated.merge(
matches_synth[['duplicates','source','target','dup_type','score','score_lev']],
left_on='source_target',
right_on='duplicates',
how='left')
)
df_eval.loc[df_eval.dup_type.isnull(), "dup_type"] = 'nomatch'
In [11]:
pd.crosstab(df_eval.true_label, df_eval.dup_type, dropna=False).reindex(["copypasta", "rewording", "translation","nomatch"])[["copy-pasta", "rewording", "translation", 'nomatch']]
Out[11]:
(rows: true_label, columns: predicted dup_type)

| true_label | copy-pasta | rewording | translation | nomatch |
|---|---|---|---|---|
| copypasta | 3871 | 94 | 0 | 65 |
| rewording | 217 | 3147 | 0 | 653 |
| translation | 0 | 0 | 2708 | 1792 |
| nomatch | 0 | 0 | 0 | 1485000 |
The d3lta algorithm:

- mistook 217 rewording pairs for copy-pasta and 94 copy-pasta pairs for rewording,
- predicted no nomatch pair as duplicated content,
- missed 2510 duplicated pairs (65 copypasta, 653 rewording, 1792 translation).

The algorithm therefore has a very good precision on duplicated content: no nomatch pair is wrongly detected as duplicated content by d3lta. Per-class precision and recall are sketched below.
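To quantify this, per-class precision and recall can be computed from `df_eval` (a sketch, assuming scikit-learn is available; the ground truth uses the label `copypasta` while d3lta predicts `copy-pasta`, so the spellings are aligned first):

```python
from sklearn.metrics import classification_report

# Align the label spelling between ground truth and predictions, then report per-class metrics
y_true = df_eval["true_label"].str.replace("copy-pasta", "copypasta", regex=False)
y_pred = df_eval["dup_type"].str.replace("copy-pasta", "copypasta", regex=False)
print(classification_report(y_true, y_pred, digits=3))
```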
In [12]:
from pelote import tables_to_graph
from ipysigma import Sigma
In [13]:
def create_edges_nodes(matches, nodes_columns):
    # edges: one row per matched pair
    edges_plot = matches.copy()
    # nodes: one row per unique document, gathering its *_source / *_target attributes
    nodes_plot = (
        pd.concat(
            [
                matches[['source'] + [c + '_source' for c in nodes_columns]]
                .rename(columns={'source': 'id'})
                .rename(columns=lambda x: x.replace('_source', '')),
                matches[['target'] + [c + '_target' for c in nodes_columns]]
                .rename(columns={'target': 'id'})
                .rename(columns=lambda x: x.replace('_target', '')),
            ],
            ignore_index=True,
        )
        .drop_duplicates('id')
    )
    nodes_plot["blank"] = " "
    return edges_plot, nodes_plot
In [14]:
edges, nodes = create_edges_nodes(matches_synth, ['text_to_embed','language'])
In [15]:
g = tables_to_graph(
nodes.reset_index()[["id",'blank','text_to_embed','language']].astype(str),
edges[["source","target","dup_type","score"]].astype(str),
node_col="id",
node_data=["id",'blank','text_to_embed','language'],
edge_data=['score',"dup_type"],
)
graph_sigma = Sigma(g,
node_label="text_to_embed",
default_node_size =.5,
edge_color="dup_type",
default_edge_type="curve",
node_border_color_from="node",
label_density=5,
label_rendered_size_threshold=0.0000001,
)
graph_sigma
Out[15]: (interactive ipysigma graph of the matched pairs)
Other embeddings¶
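`semantic_faiss` also accepts precomputed embeddings through its `df_embeddings_use` argument, so another sentence-embedding model can be substituted for the default one. Below, a multilingual Sentence Transformers model is used; note in the log that the `compute_embeddings` step is then skipped, and that the thresholds differ, since similarity scores depend on the embedding model.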
In [16]:
from sentence_transformers import SentenceTransformer
In [17]:
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
new_emb = model.encode(df_synth['original'].tolist())  # encode the original texts with the multilingual model
df_emb = pd.DataFrame(new_emb, index=df_synth.index)
In [18]:
matches, df_clusters = semantic_faiss(
df=df_synth,
min_size_txt=0,
df_embeddings_use=df_emb,
threshold_grapheme=0.693,
threshold_language=0.715,
threshold_semantic=0.85,
)
>>> Start prepare_dataset
Done. Removing 0 short texts over 2985 sentences... Done.
<<< End prepare_dataset, Took: 0.0040 sec
>>> Start create_index_cosine
<<< End create_index_cosine, Took: 0.0038 sec
>>> Start find_matches
<<< End find_matches, Took: 0.4862 sec
>>> Start compute_duplicate_types
<<< End compute_duplicate_types, Took: 0.1015 sec
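For a quick comparison with the first run, the duplicate-type counts of the new matches can be inspected in the same way (a sketch):

```python
# Distribution of duplicate types found with the Sentence Transformers embeddings
matches['dup_type'].value_counts()
```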
In [19]:
# Rebuild the edges and nodes from the matches obtained with the new embeddings
edges, nodes = create_edges_nodes(matches, ['text_to_embed', 'language'])

g = tables_to_graph(
    nodes.reset_index()[["id", 'blank', 'text_to_embed', 'language']].astype(str),
    edges[["source", "target", "dup_type", "score"]].astype(str),
    node_col="id",
    node_data=["id", 'blank', 'text_to_embed', 'language'],
    edge_data=['score', "dup_type"],
)
graph_sigma = Sigma(g,
node_label="text_to_embed",
default_node_size =.5,
edge_color="dup_type",
default_edge_type="curve",
node_border_color_from="node",
label_density=5,
label_rendered_size_threshold=0.0000001,
)
graph_sigma
Out[19]: (interactive ipysigma graph of the matched pairs)