From 6fe96d41a28da36c605223ecb4c795634c125c22 Mon Sep 17 00:00:00 2001 From: LD-Reborn Date: Mon, 22 Dec 2025 00:18:13 +0100 Subject: [PATCH] Improved entity updating --- src/Server/Controllers/EntityController.cs | 5 +- src/Server/Entity.cs | 3 +- src/Server/Helper/SearchdomainHelper.cs | 229 ++++++++++++++------- src/Server/Searchdomain.cs | 2 +- src/Shared/Models/JSONModels.cs | 2 +- 5 files changed, 164 insertions(+), 77 deletions(-) diff --git a/src/Server/Controllers/EntityController.cs b/src/Server/Controllers/EntityController.cs index 0891ba5..3326310 100644 --- a/src/Server/Controllers/EntityController.cs +++ b/src/Server/Controllers/EntityController.cs @@ -55,10 +55,7 @@ public class EntityController : ControllerBase try { List? entities = _searchdomainHelper.EntitiesFromJSON( - [], - _domainManager.embeddingCache, - _domainManager.aIProvider, - _domainManager.helper, + _domainManager, _logger, JsonSerializer.Serialize(jsonEntities)); if (entities is not null && jsonEntities is not null) diff --git a/src/Server/Entity.cs b/src/Server/Entity.cs index edf4104..681b96b 100644 --- a/src/Server/Entity.cs +++ b/src/Server/Entity.cs @@ -1,9 +1,10 @@ namespace Server; -public class Entity(Dictionary attributes, Probmethods.probMethodDelegate probMethod, List datapoints, string name) +public class Entity(Dictionary attributes, Probmethods.probMethodDelegate probMethod, string probMethodName, List datapoints, string name) { public Dictionary attributes = attributes; public Probmethods.probMethodDelegate probMethod = probMethod; + public string probMethodName = probMethodName; public List datapoints = datapoints; public int id; public string name = name; diff --git a/src/Server/Helper/SearchdomainHelper.cs b/src/Server/Helper/SearchdomainHelper.cs index 7b137bd..7f18f7c 100644 --- a/src/Server/Helper/SearchdomainHelper.cs +++ b/src/Server/Helper/SearchdomainHelper.cs @@ -44,8 +44,12 @@ public class SearchdomainHelper(ILogger logger, DatabaseHelp return null; } - public List? EntitiesFromJSON(List entityCache, Dictionary> embeddingCache, AIProvider aIProvider, SQLHelper helper, ILogger logger, string json) + public List? EntitiesFromJSON(SearchdomainManager searchdomainManager, ILogger logger, string json) { + Dictionary> embeddingCache = searchdomainManager.embeddingCache; + AIProvider aIProvider = searchdomainManager.aIProvider; + SQLHelper helper = searchdomainManager.helper; + List? jsonEntities = JsonSerializer.Deserialize>(json); if (jsonEntities is null) { @@ -72,8 +76,7 @@ public class SearchdomainHelper(ILogger logger, DatabaseHelp ParallelOptions parallelOptions = new() { MaxDegreeOfParallelism = 16 }; // <-- This is needed! Otherwise if we try to index 100+ entities at once, it spawns 100 threads, exploding the SQL pool Parallel.ForEach(jsonEntities, parallelOptions, jSONEntity => { - using var tempHelper = helper.DuplicateConnection(); - var entity = EntityFromJSON(entityCache, embeddingCache, aIProvider, tempHelper, logger, jSONEntity); + var entity = EntityFromJSON(searchdomainManager, logger, jSONEntity); if (entity is not null) { retVal.Enqueue(entity); @@ -82,87 +85,173 @@ public class SearchdomainHelper(ILogger logger, DatabaseHelp return [.. retVal]; } - public Entity? EntityFromJSON(List entityCache, Dictionary> embeddingCache, AIProvider aIProvider, SQLHelper helper, ILogger logger, JSONEntity jsonEntity) //string json) + public Entity? EntityFromJSON(SearchdomainManager searchdomainManager, ILogger logger, JSONEntity jsonEntity) //string json) { - Dictionary> embeddingsLUT = []; // embeddingsLUT: hash -> model -> [embeddingValues * n] - int? preexistingEntityID = _databaseHelper.GetEntityID(helper, jsonEntity.Name, jsonEntity.Searchdomain); - if (preexistingEntityID is not null) + SQLHelper helper = searchdomainManager.helper.DuplicateConnection(); + Searchdomain searchdomain = searchdomainManager.GetSearchdomain(jsonEntity.Searchdomain); + List entityCache = searchdomain.entityCache; + AIProvider aIProvider = searchdomain.aIProvider; + Dictionary> embeddingCache = searchdomain.embeddingCache; + Entity? preexistingEntity = entityCache.FirstOrDefault(entity => entity.name == jsonEntity.Name); + + if (preexistingEntity is not null) { - lock (helper.connection) // TODO change this to helper and do A/B tests (i.e. before/after) + int? preexistingEntityID = _databaseHelper.GetEntityID(helper, jsonEntity.Name, jsonEntity.Searchdomain); + if (preexistingEntityID is null) { - Dictionary parameters = new() - { - { "id", preexistingEntityID } - }; - System.Data.Common.DbDataReader reader = helper.ExecuteSQLCommand("SELECT e.model, e.embedding, d.hash FROM datapoint d JOIN embedding e ON d.id = e.id_datapoint WHERE d.id_entity = @id", parameters); - while (reader.Read()) - { - string model = reader.GetString(0); - long length = reader.GetBytes(1, 0, null, 0, 0); - byte[] embeddingBytes = new byte[length]; - reader.GetBytes(1, 0, embeddingBytes, 0, (int)length); - float[] embeddingValues = FloatArrayFromBytes(embeddingBytes); - string hash = reader.GetString(2); - if (!embeddingsLUT.ContainsKey(hash)) - { - embeddingsLUT[hash] = []; - } - embeddingsLUT[hash].TryAdd(model, embeddingValues); - } - reader.Close(); + _logger.LogCritical("Unable to index entity {jsonEntity.Name} because it already exists in the searchdomain but not in the database.", [jsonEntity.Name]); + throw new Exception($"Unable to index entity {jsonEntity.Name} because it already exists in the searchdomain but not in the database."); } - _databaseHelper.RemoveEntity(entityCache, helper, jsonEntity.Name, jsonEntity.Searchdomain); // TODO only remove entity if there is actually a change somewhere. Perhaps create 3 datapoint lists to operate with: 1. delete, 2. update, 3. create - } - int id_entity = DatabaseHelper.DatabaseInsertEntity(helper, jsonEntity.Name, jsonEntity.Probmethod, _databaseHelper.GetSearchdomainID(helper, jsonEntity.Searchdomain)); - foreach (KeyValuePair attribute in jsonEntity.Attributes) - { - DatabaseHelper.DatabaseInsertAttribute(helper, attribute.Key, attribute.Value, id_entity); // TODO implement bulk insert to reduce number of queries - } - - List datapoints = []; - foreach (JSONDatapoint jsonDatapoint in jsonEntity.Datapoints) - { - string hash = Convert.ToBase64String(SHA256.HashData(Encoding.UTF8.GetBytes(jsonDatapoint.Text))); - Dictionary embeddings = []; - if (embeddingsLUT.ContainsKey(hash)) + Dictionary attributes = jsonEntity.Attributes; + + // Attribute + foreach (KeyValuePair attributesKV in preexistingEntity.attributes.ToList()) { - Dictionary hashLUT = embeddingsLUT[hash]; - foreach (string model in jsonDatapoint.Model) + string oldAttributeKey = attributesKV.Key; + string oldAttribute = attributesKV.Value; + bool newHasAttribute = jsonEntity.Attributes.TryGetValue(oldAttributeKey, out string? newAttribute); + if (newHasAttribute && newAttribute is not null && newAttribute != oldAttribute) { - if (hashLUT.ContainsKey(model)) + // Attribute - Updated + Dictionary parameters = new() { - embeddings.Add(model, hashLUT[model]); - } - else + { "newValue", newAttribute }, + { "entityId", preexistingEntityID }, + { "attribute", oldAttributeKey} + }; + helper.ExecuteSQLNonQuery("UPDATE attribute SET value=@newValue WHERE id_entity=@entityId AND attribute=@attribute", parameters); + preexistingEntity.attributes[oldAttributeKey] = newAttribute; + } else if (!newHasAttribute) + { + // Attribute - Deleted + Dictionary parameters = new() { - var additionalEmbeddings = Datapoint.GenerateEmbeddings(jsonDatapoint.Text, [model], aIProvider, embeddingCache); - embeddings.Add(model, additionalEmbeddings.First().Value); + { "entityId", preexistingEntityID }, + { "attribute", oldAttributeKey} + }; + helper.ExecuteSQLNonQuery("DELETE FROM attribute WHERE id_entity=@entityId AND attribute=@attribute", parameters); + preexistingEntity.attributes.Remove(oldAttributeKey); + } + } + foreach (var attributesKV in jsonEntity.Attributes) + { + string newAttributeKey = attributesKV.Key; + string newAttribute = attributesKV.Value; + bool preexistingHasAttribute = preexistingEntity.attributes.TryGetValue(newAttributeKey, out string? preexistingAttribute); + if (!preexistingHasAttribute) + { + // Attribute - New + DatabaseHelper.DatabaseInsertAttribute(helper, newAttributeKey, newAttribute, (int)preexistingEntityID); + preexistingEntity.attributes.Add(newAttributeKey, newAttribute); + } + } + + // Datapoint + foreach (Datapoint datapoint in preexistingEntity.datapoints.ToList()) + { + bool newEntityHasDatapoint = jsonEntity.Datapoints.Any(x => x.Name == datapoint.name); + if (!newEntityHasDatapoint) + { + // Datapoint - Deleted + Dictionary parameters = new() + { + { "datapointName", datapoint.name }, + { "entityId", preexistingEntityID} + }; + helper.ExecuteSQLNonQuery("DELETE e FROM embedding e JOIN datapoint d ON e.id_datapoint=d.id WHERE d.name=@datapointName AND d.id_entity=@entityId", parameters); + helper.ExecuteSQLNonQuery("DELETE FROM datapoint WHERE id_entity=@entityId AND name=@datapointName", parameters); + preexistingEntity.datapoints.Remove(datapoint); + } else + { + JSONDatapoint? newEntityDatapoint = jsonEntity.Datapoints.FirstOrDefault(x => x.Name == datapoint.name); + if (newEntityDatapoint is not null && newEntityDatapoint.Text is not null) + { + // Datapoint - Updated + Dictionary parameters = new() + { + { "datapointName", datapoint.name }, + { "entityId", preexistingEntityID} + }; + helper.ExecuteSQLNonQuery("DELETE e FROM embedding e JOIN datapoint d ON e.id_datapoint=d.id WHERE d.name=@datapointName AND d.id_entity=@entityId", parameters); + helper.ExecuteSQLNonQuery("DELETE FROM datapoint WHERE id_entity=@entityId AND name=@datapointName", parameters); + preexistingEntity.datapoints.Remove(datapoint); + Datapoint newDatapoint = DatabaseInsertDatapointWithEmbeddings(helper, searchdomain, newEntityDatapoint, (int)preexistingEntityID); + preexistingEntity.datapoints.Add(newDatapoint); + } } } - else + foreach (JSONDatapoint jsonDatapoint in jsonEntity.Datapoints) { - embeddings = Datapoint.GenerateEmbeddings(jsonDatapoint.Text, [.. jsonDatapoint.Model], aIProvider, embeddingCache); + bool oldEntityHasDatapoint = preexistingEntity.datapoints.Any(x => x.name == jsonDatapoint.Name); + if (!oldEntityHasDatapoint) + { + // Datapoint - New + Datapoint datapoint = BuildDatapointFromJsonDatapoint(jsonDatapoint, (int)preexistingEntityID, searchdomain); + preexistingEntity.datapoints.Add(datapoint); + } } - var probMethod_embedding = new ProbMethod(jsonDatapoint.Probmethod_embedding, logger) ?? throw new ProbMethodNotFoundException(jsonDatapoint.Probmethod_embedding); - var similarityMethod = new SimilarityMethod(jsonDatapoint.SimilarityMethod, logger) ?? throw new SimilarityMethodNotFoundException(jsonDatapoint.SimilarityMethod); - Datapoint datapoint = new(jsonDatapoint.Name, probMethod_embedding, similarityMethod, hash, [.. embeddings.Select(kv => (kv.Key, kv.Value))]); - int id_datapoint = DatabaseHelper.DatabaseInsertDatapoint(helper, jsonDatapoint.Name, jsonDatapoint.Probmethod_embedding, jsonDatapoint.SimilarityMethod, hash, id_entity); // TODO make this a bulk add action to reduce number of queries - List<(string model, byte[] embedding)> data = []; - foreach ((string, float[]) embedding in datapoint.embeddings) - { - data.Add((embedding.Item1, BytesFromFloatArray(embedding.Item2))); - } - DatabaseHelper.DatabaseInsertEmbeddingBulk(helper, id_datapoint, data); - datapoints.Add(datapoint); - } - var probMethod = Probmethods.GetMethod(jsonEntity.Probmethod) ?? throw new ProbMethodNotFoundException(jsonEntity.Probmethod); - Entity entity = new(jsonEntity.Attributes, probMethod, datapoints, jsonEntity.Name) + + return preexistingEntity; + } + else { - id = id_entity - }; - entityCache.Add(entity); - return entity; + int id_entity = DatabaseHelper.DatabaseInsertEntity(helper, jsonEntity.Name, jsonEntity.Probmethod, _databaseHelper.GetSearchdomainID(helper, jsonEntity.Searchdomain)); + foreach (KeyValuePair attribute in jsonEntity.Attributes) + { + DatabaseHelper.DatabaseInsertAttribute(helper, attribute.Key, attribute.Value, id_entity); // TODO implement bulk insert to reduce number of queries + } + + List datapoints = []; + foreach (JSONDatapoint jsonDatapoint in jsonEntity.Datapoints) + { + string hash = Convert.ToBase64String(SHA256.HashData(Encoding.UTF8.GetBytes(jsonDatapoint.Text))); + Datapoint datapoint = DatabaseInsertDatapointWithEmbeddings(helper, searchdomain, jsonDatapoint, id_entity, hash); + datapoints.Add(datapoint); + } + + var probMethod = Probmethods.GetMethod(jsonEntity.Probmethod) ?? throw new ProbMethodNotFoundException(jsonEntity.Probmethod); + Entity entity = new(jsonEntity.Attributes, probMethod, jsonEntity.Probmethod, datapoints, jsonEntity.Name) + { + id = id_entity + }; + entityCache.Add(entity); + return entity; + } + } + + public Datapoint DatabaseInsertDatapointWithEmbeddings(SQLHelper helper, Searchdomain searchdomain, JSONDatapoint jsonDatapoint, int id_entity, string? hash = null) + { + if (jsonDatapoint.Text is null) + { + throw new Exception("jsonDatapoint.Text must not be null at this point"); + } + hash ??= Convert.ToBase64String(SHA256.HashData(Encoding.UTF8.GetBytes(jsonDatapoint.Text))); + Datapoint datapoint = BuildDatapointFromJsonDatapoint(jsonDatapoint, id_entity, searchdomain, hash); + int id_datapoint = DatabaseHelper.DatabaseInsertDatapoint(helper, jsonDatapoint.Name, jsonDatapoint.Probmethod_embedding, jsonDatapoint.SimilarityMethod, hash, id_entity); // TODO make this a bulk add action to reduce number of queries + List<(string model, byte[] embedding)> data = []; + foreach ((string, float[]) embedding in datapoint.embeddings) + { + data.Add((embedding.Item1, BytesFromFloatArray(embedding.Item2))); + } + DatabaseHelper.DatabaseInsertEmbeddingBulk(helper, id_datapoint, data); + return datapoint; + } + + public Datapoint BuildDatapointFromJsonDatapoint(JSONDatapoint jsonDatapoint, int entityId, Searchdomain searchdomain, string? hash = null) + { + if (jsonDatapoint.Text is null) + { + throw new Exception("jsonDatapoint.Text must not be null at this point"); + } + using SQLHelper helper = searchdomain.helper.DuplicateConnection(); + Dictionary> embeddingCache = searchdomain.embeddingCache; + hash ??= Convert.ToBase64String(SHA256.HashData(Encoding.UTF8.GetBytes(jsonDatapoint.Text))); + DatabaseHelper.DatabaseInsertDatapoint(helper, jsonDatapoint.Name, jsonDatapoint.Probmethod_embedding, jsonDatapoint.SimilarityMethod, hash, entityId); + Dictionary embeddings = Datapoint.GenerateEmbeddings(jsonDatapoint.Text, [.. jsonDatapoint.Model], searchdomain.aIProvider, embeddingCache); + var probMethod_embedding = new ProbMethod(jsonDatapoint.Probmethod_embedding, logger) ?? throw new ProbMethodNotFoundException(jsonDatapoint.Probmethod_embedding); + var similarityMethod = new SimilarityMethod(jsonDatapoint.SimilarityMethod, logger) ?? throw new SimilarityMethodNotFoundException(jsonDatapoint.SimilarityMethod); + return new Datapoint(jsonDatapoint.Name, probMethod_embedding, similarityMethod, hash, [.. embeddings.Select(kv => (kv.Key, kv.Value))]); } } \ No newline at end of file diff --git a/src/Server/Searchdomain.cs b/src/Server/Searchdomain.cs index 7566363..1b0e0c3 100644 --- a/src/Server/Searchdomain.cs +++ b/src/Server/Searchdomain.cs @@ -142,7 +142,7 @@ public class Searchdomain Probmethods.probMethodDelegate? probmethod = Probmethods.GetMethod(probmethodString); if (datapoint_unassigned.TryGetValue(id, out List? datapoints) && probmethod is not null) { - Entity entity = new(attributes, probmethod, datapoints, name) + Entity entity = new(attributes, probmethod, probmethodString, datapoints, name) { id = id }; diff --git a/src/Shared/Models/JSONModels.cs b/src/Shared/Models/JSONModels.cs index 86ca7b3..593ae96 100644 --- a/src/Shared/Models/JSONModels.cs +++ b/src/Shared/Models/JSONModels.cs @@ -12,7 +12,7 @@ public class JSONEntity public class JSONDatapoint { public required string Name { get; set; } - public required string Text { get; set; } + public required string? Text { get; set; } public required string Probmethod_embedding { get; set; } public required string SimilarityMethod { get; set; } public required string[] Model { get; set; }