- Adds the ability to shard the main consuming goroutines, alleviating a major CPU bottleneck. See DESIGN.md for more explanation.
- Adds the ability to configure the size of the buffered `chan` waiting for consumption.
- Optimises metric collection on the hot path; uses async instruments for cache metrics (a sketch follows this list).
- Moves various operations out of the hot path where possible.
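To illustrate the third bullet, here is a minimal, hypothetical sketch of moving cache metrics off the hot path with an asynchronous (observable) instrument from the otel-go metric API. The meter name, metric name, and `cacheLen` accessor are assumptions for illustration, not the processor's actual code:

```go
package metrics

import (
	"context"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/metric"
)

// registerCacheMetrics registers an observable gauge whose callback runs on
// the metrics-collection cycle, so ConsumeTraces() never pays the cost of
// recording cache sizes synchronously on the hot path.
func registerCacheMetrics(cacheLen func() int) error {
	meter := otel.Meter("atlassiansamplingprocessor") // assumed meter name
	_, err := meter.Int64ObservableGauge(
		"trace_cache_size", // hypothetical metric name
		metric.WithDescription("Number of traces currently held in the primary cache"),
		metric.WithInt64Callback(func(_ context.Context, o metric.Int64Observer) error {
			// Read the size only when metrics are actually collected.
			o.Observe(int64(cacheLen()))
			return nil
		}),
	)
	return err
}
```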
`pkg/processor/atlassiansamplingprocessor/DESIGN.md` (13 additions, 17 deletions)
@@ -8,8 +8,8 @@ It also contains information about how to run in a production environment.
This section describes, in order, the path a trace takes when consumed by this processor.

-1. `ConsumeTraces()` is invoked. This blocks on send to an unbuffered `chan`, and then returns.
-2. `consumeChan()` reads the `chan` and processes the traces.
+1. `ConsumeTraces()` is invoked. This organises the data by trace ID and shard, does an early decision check, sends to the shard `chan`, and then returns.
+2. `shardListener()` reads its assigned `chan` and processes the traces.
3. The data is organised by trace ID, and the main loop in `processor.go` processes the data one trace ID at a time.
4. The decision caches are accessed to determine if a sampling decision has already been made for the current trace ID.
   If a prior decision exists, this allows us to streamline the processing. When the cache indicates that the trace has
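As a minimal sketch of the routing that the new steps 1–2 describe (all names here are hypothetical; the real processor's shard count, hash function, and channel setup may differ), the trace ID can be hashed to pick a shard so that all spans of a trace reach the same listener, with a configurable buffer on each channel:

```go
package sharding

import (
	"hash/fnv"

	"go.opentelemetry.io/collector/pdata/pcommon"
	"go.opentelemetry.io/collector/pdata/ptrace"
)

// sharder routes trace data to one buffered channel per shard listener.
type sharder struct {
	shards []chan ptrace.Traces
}

func newSharder(numShards, bufferSize int) *sharder {
	s := &sharder{shards: make([]chan ptrace.Traces, numShards)}
	for i := range s.shards {
		s.shards[i] = make(chan ptrace.Traces, bufferSize) // configurable buffer size
	}
	return s
}

// shardFor maps a trace ID to a shard index deterministically, so spans
// belonging to the same trace always land on the same channel.
func (s *sharder) shardFor(id pcommon.TraceID) int {
	h := fnv.New32a()
	h.Write(id[:]) // pcommon.TraceID is a 16-byte array
	return int(h.Sum32()) % len(s.shards)
}
```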
@@ -25,25 +25,21 @@ Least Recently Used (LRU) basis, meaning that adding new data to the cache may i
least recently accessed trace (i.e. the trace that last received a new span the longest time ago). When a trace is
evicted, it is considered "not sampled" and added to the decision cache.

-## Synchronized Goroutine
+## Synchronization / Sharding

-The main operation of the processor is executed as a single goroutine, synchronized through an unbuffered channel.
+The main processing of this component is done by asynchronous goroutines ("shard listeners"), which read off "shards" (channels).
+Trace data is sharded in `ConsumeTraces()` before being sent to the appropriate shard for processing.

In the collector architecture, receivers typically function as servers that accept and process data using multiple goroutines.
Consequently, processors like this one are invoked concurrently through the `ConsumeTraces()` method.
-To ensure synchronization, the processor sends data to a channel, which is then received by a
-dedicated goroutine (`consumeChan()`). This design guarantees that all data is processed by a single goroutine.
-It draws inspiration from the core collector's batch processor.
-
-The decision to synchronize is primarily driven by the need to maintain the integrity of internal
-caches while keeping the design simple. Allowing concurrent access to cached trace data would complicate the
-code significantly and potentially lead to bugs, as experienced in the upstream tail sampling processor.
-
-This is, of course, a trade-off. The processing throughput is limited by the capacity of a single goroutine,
-creating a potential bottleneck. This can be alleviated by deploying more instances of the processor with reduced
-memory allocation per instance (e.g., more nodes, each with less memory). If the bottleneck becomes a significant issue,
-a future enhancement could involve sharding the processor. This would involve splitting the processing workload by trace
-ID and maintaining separate caches and states for each shard.
+To ensure synchronization, the processor sends data to channels, each of which is received by a
+dedicated shard listener. Spans belonging to the same trace are all sent to the same shard, and that shard is
+processed entirely synchronously by the same shard listener. This ensures data integrity, because writes are limited
+to the shard listeners; each shard listener can be thought of as "owning" a section of the caches.
+
+All shard listeners still access the same caches, so a global lock is employed for any operation that may affect data across shards.
+The prime example of such an operation is resizing the caches, which can be performed by any shard listener but may delete data
+belonging to a different shard listener. So, a brief stop-the-world halt occurs while the caches are resized.
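A rough sketch of the ownership and locking model the added lines describe, assuming the "global lock" is realized as a `sync.RWMutex` (all names hypothetical): shard listeners hold the read side during normal processing, while a cache resize takes the write side and briefly halts every listener.

```go
package sharding

import (
	"sync"

	"go.opentelemetry.io/collector/pdata/ptrace"
)

type processor struct {
	globalLock sync.RWMutex // guards cross-shard operations such as cache resizes
	shards     []chan ptrace.Traces
}

// shardListener runs as one goroutine per shard. Writes inside process()
// only touch the section of the caches this shard "owns".
func (p *processor) shardListener(shard chan ptrace.Traces) {
	for td := range shard {
		p.globalLock.RLock() // listeners proceed concurrently under the read lock
		p.process(td)
		p.globalLock.RUnlock()

		if p.resizeNeeded() {
			p.resizeCaches() // may run on any shard listener
		}
	}
}

// resizeCaches may evict entries owned by other shards, so it takes the
// write lock, producing a brief stop-the-world halt across all listeners.
func (p *processor) resizeCaches() {
	p.globalLock.Lock()
	defer p.globalLock.Unlock()
	// ... resize caches and evict entries here ...
}

func (p *processor) process(td ptrace.Traces) { /* per-shard work */ }
func (p *processor) resizeNeeded() bool       { return false /* placeholder */ }
```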