add metrics tags search support to prometheus

wololowarrior · wololowarrior · commit 426a3f55a256 · 2025-08-23T13:15:13.000+05:30
Signed-off-by: Harshil Gupta <harshilgupta1808@gmail.com>
diff --git a/cmd/query/app/query_parser.go b/cmd/query/app/query_parser.go
@@ -275,6 +275,15 @@ func (p *queryParser) parseMetricsQueryParams(r *http.Request) (bqp metricstore.
 	if err != nil {
 		return bqp, err
 	}
+
+	tags, err := p.parseTags(r.Form[tagParam], r.Form[tagsParam])
+	if err != nil {
+		return bqp, err
+	}
+	if len(tags) > 0 {
+		bqp.Tags = tags
+	}
+
 	bqp.EndTime = &endTs
 	bqp.Lookback = &lookback
 	bqp.Step = &step
diff --git a/cmd/query/app/query_parser_test.go b/cmd/query/app/query_parser_test.go
@@ -370,3 +370,25 @@ func TestParameterErrors(t *testing.T) {
 		})
 	}
 }
+
+func TestParseMetricsTags(t *testing.T) {
+	t.Run("simple tags only", func(t *testing.T) {
+		request, err := http.NewRequest(http.MethodGet, "x?service=foo&step=1000&tag=key1:value1,key2:value2", http.NoBody)
+		require.NoError(t, err)
+		parser := &queryParser{
+			timeNow: time.Now,
+		}
+		mqp, err := parser.parseMetricsQueryParams(request)
+		require.NoError(t, err)
+		assert.Equal(t, time.Second, *mqp.Step)
+	})
+	t.Run("malformed simple tag", func(t *testing.T) {
+		request, err := http.NewRequest(http.MethodGet, "x?service=foo&step=1000&tag=keyWithoutValue", http.NoBody)
+		require.NoError(t, err)
+		parser := &queryParser{
+			timeNow: time.Now,
+		}
+		_, err = parser.parseMetricsQueryParams(request)
+		require.Error(t, err)
+	})
+}
diff --git a/internal/storage/metricstore/prometheus/metricstore/reader.go b/internal/storage/metricstore/prometheus/metricstore/reader.go
@@ -51,6 +51,7 @@ type (
 		spanKindFilter string
 		serviceFilter  string
 		rate           string
+		tagFilters     []string
 	}
 
 	metricsQueryParams struct {
@@ -134,13 +135,25 @@ func (m MetricsReader) GetLatencies(ctx context.Context, requestParams *metricst
 		metricName:          "service_latencies",
 		metricDesc:          fmt.Sprintf("%.2fth quantile latency, grouped by service", requestParams.Quantile),
 		buildPromQuery: func(p promQueryParams) string {
+			// Build filter string including service_name, span_kind, and tags
+			filters := []string{fmt.Sprintf(`service_name =~ %q`, p.serviceFilter)}
+
+			if p.spanKindFilter != "" {
+				filters = append(filters, p.spanKindFilter)
+			}
+
+			// Add tag filters if there are any
+			if len(p.tagFilters) > 0 {
+				filters = append(filters, p.tagFilters...)
+			}
+
+			filterStr := strings.Join(filters, ", ")
+
 			return fmt.Sprintf(
-				// Note: p.spanKindFilter can be ""; trailing commas are okay within a timeseries selection.
-				`histogram_quantile(%.2f, sum(rate(%s_bucket{service_name =~ %q, %s}[%s])) by (%s))`,
+				`histogram_quantile(%.2f, sum(rate(%s_bucket{%s}[%s])) by (%s))`,
 				requestParams.Quantile,
 				m.latencyMetricName,
-				p.serviceFilter,
-				p.spanKindFilter,
+				filterStr,
 				p.rate,
 				p.groupBy,
 			)
@@ -177,12 +190,24 @@ func (m MetricsReader) GetCallRates(ctx context.Context, requestParams *metricst
 		metricName:          "service_call_rate",
 		metricDesc:          "calls/sec, grouped by service",
 		buildPromQuery: func(p promQueryParams) string {
+			// Build filter string including service_name, span_kind, and tags
+			filters := []string{fmt.Sprintf(`service_name =~ %q`, p.serviceFilter)}
+
+			if p.spanKindFilter != "" {
+				filters = append(filters, p.spanKindFilter)
+			}
+
+			// Add tag filters if there are any
+			if len(p.tagFilters) > 0 {
+				filters = append(filters, p.tagFilters...)
+			}
+
+			filterStr := strings.Join(filters, ", ")
+
 			return fmt.Sprintf(
-				// Note: p.spanKindFilter can be ""; trailing commas are okay within a timeseries selection.
-				`sum(rate(%s{service_name =~ %q, %s}[%s])) by (%s)`,
+				`sum(rate(%s{%s}[%s])) by (%s)`,
 				m.callsMetricName,
-				p.serviceFilter,
-				p.spanKindFilter,
+				filterStr,
 				p.rate,
 				p.groupBy,
 			)
@@ -211,11 +236,32 @@ func (m MetricsReader) GetErrorRates(ctx context.Context, requestParams *metrics
 		metricName:          "service_error_rate",
 		metricDesc:          "error rate, computed as a fraction of errors/sec over calls/sec, grouped by service",
 		buildPromQuery: func(p promQueryParams) string {
+			// Build base filters for all queries (service_name)
+			baseFilters := []string{fmt.Sprintf(`service_name =~ %q`, p.serviceFilter)}
+
+			// Add status_code filter only for error rate numerator, must be right after service_name to match test expectations
+			errorFilters := append([]string{}, baseFilters...)
+			errorFilters = append(errorFilters, `status_code = "STATUS_CODE_ERROR"`)
+
+			// Add span_kind filter
+			if p.spanKindFilter != "" {
+				baseFilters = append(baseFilters, p.spanKindFilter)
+				errorFilters = append(errorFilters, p.spanKindFilter)
+			}
+
+			// Add tag filters if there are any
+			if len(p.tagFilters) > 0 {
+				baseFilters = append(baseFilters, p.tagFilters...)
+				errorFilters = append(errorFilters, p.tagFilters...)
+			}
+
+			errorFilterStr := strings.Join(errorFilters, ", ")
+			baseFilterStr := strings.Join(baseFilters, ", ")
+
 			return fmt.Sprintf(
-				// Note: p.spanKindFilter can be ""; trailing commas are okay within a timeseries selection.
-				`sum(rate(%s{service_name =~ %q, status_code = "STATUS_CODE_ERROR", %s}[%s])) by (%s) / sum(rate(%s{service_name =~ %q, %s}[%s])) by (%s)`,
-				m.callsMetricName, p.serviceFilter, p.spanKindFilter, p.rate, p.groupBy,
-				m.callsMetricName, p.serviceFilter, p.spanKindFilter, p.rate, p.groupBy,
+				`sum(rate(%s{%s}[%s])) by (%s) / sum(rate(%s{%s}[%s])) by (%s)`,
+				m.callsMetricName, errorFilterStr, p.rate, p.groupBy,
+				m.callsMetricName, baseFilterStr, p.rate, p.groupBy,
 			)
 		},
 	}
@@ -308,11 +354,24 @@ func (m MetricsReader) buildPromQuery(metricsParams metricsQueryParams) string {
 	if len(metricsParams.SpanKinds) > 0 {
 		spanKindFilter = fmt.Sprintf(`span_kind =~ %q`, strings.Join(metricsParams.SpanKinds, "|"))
 	}
+
+	// Build tag filters
+	var tagFilters []string
+	if len(metricsParams.Tags) > 0 {
+		for k, v := range metricsParams.Tags {
+			// Replace dots in key names with underscores for Prometheus compatibility
+			escapedKey := strings.ReplaceAll(k, ".", "_")
+			tagFilters = append(tagFilters, fmt.Sprintf(`%s=%q`, escapedKey, v))
+		}
+	}
+
+	fmt.Println(">>>>>>>>> tagfilters", tagFilters)
 	promParams := promQueryParams{
 		serviceFilter:  strings.Join(metricsParams.ServiceNames, "|"),
 		spanKindFilter: spanKindFilter,
 		rate:           promqlDurationString(metricsParams.RatePer),
 		groupBy:        strings.Join(groupBy, ","),
+		tagFilters:     tagFilters,
 	}
 	return metricsParams.buildPromQuery(promParams)
 }
diff --git a/internal/storage/v1/api/metricstore/interface.go b/internal/storage/v1/api/metricstore/interface.go
@@ -45,6 +45,8 @@ type BaseQueryParameters struct {
 	RatePer *time.Duration
 	// SpanKinds is the list of span kinds to include (logical OR) in the resulting metrics aggregation.
 	SpanKinds []string
+	// Tags is a map of tag keys and values to filter the metrics by.
+	Tags map[string]string
 }
 
 // LatenciesQueryParameters contains the parameters required for latency metrics queries.

Original file line number	Diff line number	Diff line change
`@@ -45,6 +45,8 @@ type BaseQueryParameters struct {`
`45`	`45`	`RatePer *time.Duration`
`46`	`46`	`// SpanKinds is the list of span kinds to include (logical OR) in the resulting metrics aggregation.`
`47`	`47`	`SpanKinds []string`
	`48`	`+ // Tags is a map of tag keys and values to filter the metrics by.`
	`49`	`+ Tags map[string]string`
`48`	`50`	`}`
`49`	`51`
`50`	`52`	`// LatenciesQueryParameters contains the parameters required for latency metrics queries.`