
Commit 1d01403

Merge branch 'profiling' into profiling_stripped_warming_queries

2 parents 59e6bc5 + 780f3cf

File tree: 4 files changed, +413 −13 lines

src/main/java/com/yelp/nrtsearch/server/grpc/LuceneServer.java

Lines changed: 2 additions & 0 deletions
```diff
@@ -248,6 +248,8 @@ private void registerMetrics(GlobalState globalState) {
     new ProcStatCollector().register(collectorRegistry);
     new MergeSchedulerCollector(globalState).register(collectorRegistry);
     new SearchResponseCollector(globalState).register(collectorRegistry);
+
+    CustomIndexingMetrics.register(collectorRegistry);
   }
 
   /** Main launches the server from the command line. */
```

src/main/java/com/yelp/nrtsearch/server/luceneserver/AddDocumentHandler.java

Lines changed: 140 additions & 13 deletions
```diff
@@ -17,23 +17,33 @@
 
 import com.google.protobuf.ProtocolStringList;
 import com.yelp.nrtsearch.server.grpc.AddDocumentRequest;
+import com.yelp.nrtsearch.server.grpc.AddDocumentRequest.MultiValuedField;
 import com.yelp.nrtsearch.server.grpc.DeadlineUtils;
 import com.yelp.nrtsearch.server.grpc.FacetHierarchyPath;
+import com.yelp.nrtsearch.server.luceneserver.Handler.HandlerException;
 import com.yelp.nrtsearch.server.luceneserver.field.FieldDef;
 import com.yelp.nrtsearch.server.luceneserver.field.IdFieldDef;
 import com.yelp.nrtsearch.server.luceneserver.field.IndexableFieldDef;
+import com.yelp.nrtsearch.server.monitoring.CustomIndexingMetrics;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Queue;
+import java.util.Set;
 import java.util.concurrent.Callable;
 import java.util.concurrent.LinkedBlockingDeque;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.Term;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
```

```diff
@@ -45,6 +55,13 @@ public class AddDocumentHandler {
    * context for the AddDocumentRequest including root document and optional child documents if
    * schema contains nested objects
    */
+  /*
+   constants matching elasticpipe, only needed for the POC; to be deleted.
+  */
+  private static final String PARTIAL_UPDATE_KEY = "_is_partial_update";
+
+  private static final String PARTIAL_UPDATE_FIELDS = "_partial_update_fields";
+
   public static class DocumentsContext {
     private final Document rootDocument;
     private final Map<String, List<Document>> childDocuments;
@@ -77,8 +94,12 @@ public static DocumentsContext getDocumentsContext(
       AddDocumentRequest addDocumentRequest, IndexState indexState)
       throws AddDocumentHandlerException {
     DocumentsContext documentsContext = new DocumentsContext();
-    Map<String, AddDocumentRequest.MultiValuedField> fields = addDocumentRequest.getFieldsMap();
-    for (Map.Entry<String, AddDocumentRequest.MultiValuedField> entry : fields.entrySet()) {
+    Map<String, MultiValuedField> fields = addDocumentRequest.getFieldsMap();
+    for (Entry<String, MultiValuedField> entry : fields.entrySet()) {
+      if (entry.getKey().equals(PARTIAL_UPDATE_KEY)
+          || entry.getKey().equals(PARTIAL_UPDATE_FIELDS)) {
+        continue;
+      }
       parseOneField(entry.getKey(), entry.getValue(), documentsContext, indexState);
     }
 
@@ -116,7 +137,7 @@ private static void extractFieldNamesForDocument(Document document) {
   /** Parses a field's value, which is a MultiValuedField in all cases */
   private static void parseOneField(
       String fieldName,
-      AddDocumentRequest.MultiValuedField value,
+      MultiValuedField value,
       DocumentsContext documentsContext,
       IndexState indexState)
       throws AddDocumentHandlerException {
@@ -125,9 +146,7 @@ private static void parseOneField(
 
   /** Parse MultiValuedField for a single field, which is always a List<String>. */
   private static void parseMultiValueField(
-      FieldDef field,
-      AddDocumentRequest.MultiValuedField value,
-      DocumentsContext documentsContext)
+      FieldDef field, MultiValuedField value, DocumentsContext documentsContext)
       throws AddDocumentHandlerException {
     ProtocolStringList fieldValues = value.getValueList();
     List<FacetHierarchyPath> facetHierarchyPaths = value.getFaceHierarchyPathsList();
@@ -153,7 +172,7 @@ private static void parseMultiValueField(
     }
   }
 
-  public static class AddDocumentHandlerException extends Handler.HandlerException {
+  public static class AddDocumentHandlerException extends HandlerException {
     public AddDocumentHandlerException(String errorMessage) {
       super(errorMessage);
     }
```
```diff
@@ -181,6 +200,40 @@ public DocumentIndexer(
       this.indexName = indexName;
     }
 
+    private static boolean isPartialUpdate(AddDocumentRequest addDocumentRequest) {
+      return addDocumentRequest.getFieldsMap().containsKey(PARTIAL_UPDATE_KEY)
+          && Boolean.parseBoolean(
+              addDocumentRequest.getFieldsMap().get(PARTIAL_UPDATE_KEY).getValue(0));
+    }
+
+    private static Set<String> getPartialUpdateFields(AddDocumentRequest addDocumentRequest) {
+      Set<String> partialUpdateFields = new HashSet<>();
+      MultiValuedField field = addDocumentRequest.getFieldsMap().get(PARTIAL_UPDATE_FIELDS);
+      if (field != null) {
+        // For some reason, a set passed from Elasticpipe such as [inactive] arrives as the
+        // literal string "[inactive]": the leading '[' and trailing ']' are part of the value
+        // rather than list delimiters. So we strip the first and last characters, then split
+        // the remainder on commas.
+        List<String> cleansedValues =
+            field.getValueList().stream()
+                .map(value -> value.substring(1, value.length() - 1)) // Remove enclosing brackets
+                .flatMap(
+                    value -> {
+                      if (value.contains(",")) {
+                        return Arrays.stream(value.split(","));
+                      } else {
+                        return Stream.of(value);
+                      }
+                    })
+                .map(String::trim) // Trim each element
+                .collect(Collectors.toList());
+        partialUpdateFields.addAll(cleansedValues);
+      }
+      return partialUpdateFields;
+    }
+
     public long runIndexingJob() throws Exception {
       DeadlineUtils.checkDeadline("DocumentIndexer: runIndexingJob", "INDEXING");
 
```
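For reference, here is a sketch of the request shape these helpers consume. The control fields `_is_partial_update`, `_partial_update_fields`, and `ad_bid_id` come from this commit; the `inactive`/`budget` fields and all values are hypothetical. Note `_partial_update_fields` arriving as a single literal bracketed string, the quirk `getPartialUpdateFields` works around:

```java
import com.yelp.nrtsearch.server.grpc.AddDocumentRequest;
import com.yelp.nrtsearch.server.grpc.AddDocumentRequest.MultiValuedField;

public class PartialUpdateRequestSketch {
  public static void main(String[] args) {
    // All field values below are made up for illustration.
    AddDocumentRequest request =
        AddDocumentRequest.newBuilder()
            .putFields(
                "_is_partial_update", MultiValuedField.newBuilder().addValue("true").build())
            .putFields(
                "_partial_update_fields",
                // arrives as one literal string, brackets included
                MultiValuedField.newBuilder().addValue("[inactive, budget]").build())
            .putFields("ad_bid_id", MultiValuedField.newBuilder().addValue("12345").build())
            .putFields("inactive", MultiValuedField.newBuilder().addValue("true").build())
            .putFields("budget", MultiValuedField.newBuilder().addValue("100").build())
            .build();

    // isPartialUpdate(request)        -> true
    // getPartialUpdateFields(request) -> {"inactive", "budget"}
  }
}
```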
```diff
@@ -192,16 +245,44 @@ public long runIndexingJob() throws Exception {
       IndexState indexState;
       ShardState shardState;
       IdFieldDef idFieldDef;
-
+      String ad_bid_id = "";
       try {
         indexState = globalState.getIndex(this.indexName);
         shardState = indexState.getShard(0);
         idFieldDef = indexState.getIdFieldDef().orElse(null);
         for (AddDocumentRequest addDocumentRequest : addDocumentRequestList) {
+          boolean partialUpdate = isPartialUpdate(addDocumentRequest);
+          final Set<String> partialUpdateFields;
+          if (partialUpdate) {
+            // removing all fields except rtb fields for the POC; in the actual implementation
+            // we will only receive the fields that need to be updated
+            partialUpdateFields = getPartialUpdateFields(addDocumentRequest);
+            Map<String, MultiValuedField> docValueFields =
+                getDocValueFieldsForUpdateCall(addDocumentRequest, partialUpdateFields);
+            ad_bid_id = addDocumentRequest.getFieldsMap().get("ad_bid_id").getValue(0);
+            addDocumentRequest =
+                AddDocumentRequest.newBuilder().putAllFields(docValueFields).build();
+          } else {
+            partialUpdateFields = new HashSet<>();
+          }
+
           DocumentsContext documentsContext =
-              AddDocumentHandler.LuceneDocumentBuilder.getDocumentsContext(
-                  addDocumentRequest, indexState);
+              LuceneDocumentBuilder.getDocumentsContext(addDocumentRequest, indexState);
+
+          /*
+           If this is a partial update request, we only need the partial-update docValue
+           fields from the DocumentsContext.
+          */
+          List<IndexableField> partialUpdateDocValueFields = new ArrayList<>();
+          if (partialUpdate) {
+            partialUpdateDocValueFields =
+                documentsContext.getRootDocument().getFields().stream()
+                    .filter(f -> partialUpdateFields.contains(f.name()))
+                    .toList();
+          }
+
           if (documentsContext.hasNested()) {
+            logger.info("Indexing nested documents for ad_bid_id: {}", ad_bid_id);
             try {
               if (idFieldDef != null) {
                 // update documents in the queue to keep order
@@ -222,7 +303,24 @@ public long runIndexingJob() throws Exception {
               throw new IOException(e);
             }
           } else {
-            documents.add(documentsContext.getRootDocument());
+            if (partialUpdate) {
+              CustomIndexingMetrics.updateDocValuesRequestsReceived.labels(indexName).inc();
+              Term term = new Term(idFieldDef.getName(), ad_bid_id);
+              // executing the partial update
+              logger.debug(
+                  "running a partial update for the ad_bid_id: {} and fields {} in the thread {}",
+                  ad_bid_id,
+                  partialUpdateDocValueFields,
+                  Thread.currentThread().getName() + Thread.currentThread().threadId());
+              long nanoTime = System.nanoTime();
+              shardState.writer.updateDocValues(
+                  term, partialUpdateDocValueFields.toArray(new Field[0]));
+              CustomIndexingMetrics.updateDocValuesLatency
+                  .labels(indexName)
+                  .set((System.nanoTime() - nanoTime));
+            } else {
+              documents.add(documentsContext.getRootDocument());
+            }
           }
         }
       } catch (Exception e) {
```
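The partial-update branch above goes through Lucene's `IndexWriter.updateDocValues(Term, Field...)`, which rewrites only the doc-values for documents matching the term instead of re-indexing the whole document; the supplied fields must be doc-values types such as `NumericDocValuesField` or `BinaryDocValuesField`. A minimal standalone sketch of that API, separate from this commit, with illustrative field names:

```java
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.ByteBuffersDirectory;

public class UpdateDocValuesSketch {
  public static void main(String[] args) throws Exception {
    try (ByteBuffersDirectory dir = new ByteBuffersDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
      Document doc = new Document();
      doc.add(new StringField("id", "doc1", Field.Store.NO));
      doc.add(new NumericDocValuesField("bid", 100L));
      writer.addDocument(doc);

      // In-place doc-values update: only the "bid" doc values of documents matching
      // the term are rewritten; the inverted index and stored fields are untouched.
      writer.updateDocValues(new Term("id", "doc1"), new NumericDocValuesField("bid", 120L));
      writer.commit();
    }
  }
}
```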
```diff
@@ -252,6 +350,15 @@ public long runIndexingJob() throws Exception {
       return shardState.writer.getMaxCompletedSequenceNumber();
     }
 
+    private static Map<String, MultiValuedField> getDocValueFieldsForUpdateCall(
+        AddDocumentRequest addDocumentRequest, Set<String> partialUpdateFields) {
+      Map<String, MultiValuedField> docValueFields =
+          addDocumentRequest.getFieldsMap().entrySet().stream()
+              .filter(e -> partialUpdateFields.contains(e.getKey()))
+              .collect(Collectors.toMap(Entry::getKey, Entry::getValue));
+      return docValueFields;
+    }
+
     /**
      * update documents with nested objects
      *
@@ -267,7 +374,7 @@ private void updateNestedDocuments(
         ShardState shardState)
         throws IOException {
       List<Document> documents = new ArrayList<>();
-      for (Map.Entry<String, List<Document>> e : documentsContext.getChildDocuments().entrySet()) {
+      for (Entry<String, List<Document>> e : documentsContext.getChildDocuments().entrySet()) {
         documents.addAll(
             e.getValue().stream()
                 .map(v -> handleFacets(indexState, shardState, v))
@@ -282,7 +389,12 @@ private void updateNestedDocuments(
       }
 
       documents.add(rootDoc);
+      CustomIndexingMetrics.addDocumentRequestsReceived.labels(indexName).inc();
+      long nanoTime = System.nanoTime();
       shardState.writer.updateDocuments(idFieldDef.getTerm(rootDoc), documents);
+      CustomIndexingMetrics.addDocumentLatency
+          .labels(indexName)
+          .set((System.nanoTime() - nanoTime));
     }
 
     /**
@@ -296,15 +408,20 @@ private void addNestedDocuments(
         DocumentsContext documentsContext, IndexState indexState, ShardState shardState)
         throws IOException {
       List<Document> documents = new ArrayList<>();
-      for (Map.Entry<String, List<Document>> e : documentsContext.getChildDocuments().entrySet()) {
+      for (Entry<String, List<Document>> e : documentsContext.getChildDocuments().entrySet()) {
         documents.addAll(
             e.getValue().stream()
                 .map(v -> handleFacets(indexState, shardState, v))
                 .collect(Collectors.toList()));
       }
       Document rootDoc = handleFacets(indexState, shardState, documentsContext.getRootDocument());
       documents.add(rootDoc);
+      CustomIndexingMetrics.addDocumentRequestsReceived.labels(indexName).inc();
+      long nanoTime = System.nanoTime();
       shardState.writer.addDocuments(documents);
+      CustomIndexingMetrics.addDocumentLatency
+          .labels(indexName)
+          .set((System.nanoTime() - nanoTime));
     }
 
     private void updateDocuments(
@@ -314,8 +431,13 @@ private void updateDocuments(
         ShardState shardState)
         throws IOException {
       for (Document nextDoc : documents) {
+        CustomIndexingMetrics.addDocumentRequestsReceived.labels(indexName).inc();
+        long nanoTime = System.nanoTime();
        nextDoc = handleFacets(indexState, shardState, nextDoc);
        shardState.writer.updateDocument(idFieldDef.getTerm(nextDoc), nextDoc);
+        CustomIndexingMetrics.addDocumentLatency
+            .labels(indexName)
+            .set((System.nanoTime() - nanoTime));
       }
     }
 
@@ -326,6 +448,8 @@ private void addDocuments(
         throw new IllegalStateException(
             "Adding documents to an index on a replica node is not supported");
       }
+      CustomIndexingMetrics.addDocumentRequestsReceived.labels(indexName).inc(documents.size());
+      long nanoTime = System.nanoTime();
       shardState.writer.addDocuments(
           (Iterable<Document>)
               () ->
@@ -349,6 +473,9 @@ public Document next() {
                 return nextDoc;
               }
             });
+      CustomIndexingMetrics.addDocumentLatency
+          .labels(indexName)
+          .set((System.nanoTime() - nanoTime) / documents.size());
     }
 
     private Document handleFacets(IndexState indexState, ShardState shardState, Document nextDoc) {
```
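A few notes on the instrumentation pattern repeated above: the latency gauges store raw `System.nanoTime()` deltas, so the recorded values are in nanoseconds, and the bulk `addDocuments` path divides the elapsed time by `documents.size()` to approximate a per-document average. Since a Prometheus Gauge keeps only the most recent `set()` value, samples from concurrent indexing threads overwrite one another; a Summary or Histogram would be the usual choice if a latency distribution is wanted rather than a last-observed value.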
src/main/java/com/yelp/nrtsearch/server/monitoring/CustomIndexingMetrics.java (new file)

Lines changed: 59 additions & 0 deletions
```java
/*
 * Copyright 2025 Yelp Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.yelp.nrtsearch.server.monitoring;

import io.prometheus.client.CollectorRegistry;
import io.prometheus.client.Counter;
import io.prometheus.client.Gauge;

public class CustomIndexingMetrics {
  public static final Counter updateDocValuesRequestsReceived =
      Counter.build()
          .name("update_doc_values_requests_received")
          .help("Number of requests received for the update doc values API")
          .labelNames("index")
          .create();

  // counter for addDocument requests received for the index with the index name as the label value
  public static final Counter addDocumentRequestsReceived =
      Counter.build()
          .name("add_document_requests_received")
          .help("Number of requests received for the add document API")
          .labelNames("index")
          .create();

  public static final Gauge updateDocValuesLatency =
      Gauge.build()
          .name("update_doc_values_latency")
          .help("Latency of the update doc values API")
          .labelNames("index")
          .create();

  // gauge for the latency of the addDocument API with the index name as the label value
  public static final Gauge addDocumentLatency =
      Gauge.build()
          .name("add_document_latency")
          .help("Latency of the add document API")
          .labelNames("index")
          .create();

  public static void register(CollectorRegistry registry) {
    registry.register(updateDocValuesRequestsReceived);
    registry.register(addDocumentRequestsReceived);
    registry.register(updateDocValuesLatency);
    registry.register(addDocumentLatency);
  }
}
```
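Because each collector here is built with `.create()` rather than `.register()`, it is not attached to any registry until `CustomIndexingMetrics.register(...)` runs, which is what the LuceneServer change at the top of this commit does at startup. A minimal usage sketch, with a hypothetical index label value:

```java
import io.prometheus.client.CollectorRegistry;

public class CustomIndexingMetricsDemo {
  public static void main(String[] args) throws Exception {
    CollectorRegistry registry = new CollectorRegistry();
    CustomIndexingMetrics.register(registry);

    // Count one addDocument request for a hypothetical index and record its
    // latency in nanoseconds, mirroring the pattern in AddDocumentHandler.
    CustomIndexingMetrics.addDocumentRequestsReceived.labels("test_index").inc();
    long start = System.nanoTime();
    Thread.sleep(5); // stand-in for the actual index write
    CustomIndexingMetrics.addDocumentLatency.labels("test_index").set(System.nanoTime() - start);

    System.out.println(
        "requests: "
            + CustomIndexingMetrics.addDocumentRequestsReceived.labels("test_index").get()
            + ", last latency (ns): "
            + CustomIndexingMetrics.addDocumentLatency.labels("test_index").get());
  }
}
```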
