diff --git a/app.go b/app.go
index 30e73c761..5cf8ec5e6 100755
--- a/app.go
+++ b/app.go
@@ -31,20 +31,21 @@ import (
"context"
"flag"
"fmt"
+ "github.com/fsnotify/fsnotify"
+ "github.com/shirou/gopsutil/v4/process"
+ "infini.sh/framework/core/task"
+ "infini.sh/framework/core/wrapper/taskset"
+ "infini.sh/framework/modules/configs/client"
"os"
"os/signal"
+ "path/filepath"
"runtime"
"runtime/debug"
"sync"
"syscall"
"time"
- "github.com/fsnotify/fsnotify"
- "github.com/shirou/gopsutil/v4/process"
- "infini.sh/framework/core/task"
- "infini.sh/framework/core/wrapper/taskset"
- "infini.sh/framework/modules/configs/client"
-
+ log "github.com/cihub/seelog"
"github.com/kardianos/service"
"infini.sh/framework/core/config"
"infini.sh/framework/core/daemon"
@@ -52,7 +53,6 @@ import (
"infini.sh/framework/core/errors"
"infini.sh/framework/core/global"
"infini.sh/framework/core/keystore"
- "infini.sh/framework/core/log"
_ "infini.sh/framework/core/logging"
"infini.sh/framework/core/logging/logger"
"infini.sh/framework/core/module"
@@ -85,6 +85,20 @@ type App struct {
svcFlag string
}
+// getServiceWorkingDirectory picks the directory a service definition should run from.
+//
+// The directory holding the running executable is preferred. When the executable path
+// cannot be resolved, the process's current working directory is used instead; if that
+// lookup fails as well the function panics.
+func getServiceWorkingDirectory() string {
+	if exe, exeErr := os.Executable(); exeErr == nil {
+		// Service managers frequently start the process from a cwd of their own choosing.
+		// Anchoring to the executable's directory keeps relative data/log/config paths
+		// identical between service installs and manual runs.
+		return filepath.Dir(exe)
+	}
+
+	cwd, cwdErr := os.Getwd()
+	if cwdErr != nil {
+		panic(cwdErr)
+	}
+	return cwd
+}
+
const (
env_SILENT_GREETINGS = "SILENT_GREETINGS"
env_SERVICE_NAME = "SERVICE_NAME"
@@ -575,10 +589,7 @@ func (app *App) Run() {
svcOptions["SuccessExitStatus"] = "1 2 8 SIGKILL"
svcOptions["LimitNOFILE"] = 1024000
- workdir, err := os.Getwd()
- if err != nil {
- panic(err)
- }
+ workdir := getServiceWorkingDirectory()
serviceName := app.environment.GetAppLowercaseName()
if v, ok := os.LookupEnv(env_SERVICE_NAME); ok {
diff --git a/app_test.go b/app_test.go
new file mode 100644
index 000000000..9c65bde87
--- /dev/null
+++ b/app_test.go
@@ -0,0 +1,43 @@
+// Copyright (C) INFINI Labs & INFINI LIMITED.
+//
+// The INFINI Framework is offered under the GNU Affero General Public License v3.0
+// and as commercial software.
+//
+// For commercial licensing, contact us at:
+// - Website: infinilabs.com
+// - Email: hello@infini.ltd
+//
+// Open Source licensed under AGPL V3:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package framework
+
+import (
+ "os"
+ "path/filepath"
+ "testing"
+)
+
+// TestGetServiceWorkingDirectoryUsesExecutableDir checks that, when the executable path
+// can be resolved, the service working directory is the executable's directory.
+func TestGetServiceWorkingDirectoryUsesExecutableDir(t *testing.T) {
+	exe, err := os.Executable()
+	if err != nil {
+		t.Fatalf("failed to get executable path: %v", err)
+	}
+
+	if got, want := getServiceWorkingDirectory(), filepath.Dir(exe); got != want {
+		t.Fatalf("expected service working directory %q, got %q", want, got)
+	}
+}
diff --git a/core/config/fs_watcher.go b/core/config/fs_watcher.go
index 45706c402..873dd0700 100644
--- a/core/config/fs_watcher.go
+++ b/core/config/fs_watcher.go
@@ -62,6 +62,36 @@ func loadConfigFile(file string) *Config {
return nil
}
+// dispatchConfigChangeEvent fans one file-system event out to every registered listener.
+//
+// Listeners run in a fixed order:
+//  1. the per-watcher callbacks passed in watcherCallbacks;
+//  2. section callbacks, in section registration order, for every registered section
+//     that is present in the changed file (skipped entirely when the file cannot be loaded);
+//  3. the generic callbacks registered through NotifyOnConfigChange.
+//
+// The callback registries are copied while holding cfgLocker for reading, because
+// registration mutates them under the same lock. The lock is released before any callback
+// runs, so a callback may itself register listeners; those take effect from the next event.
+func dispatchConfigChangeEvent(ev fsnotify.Event, watcherCallbacks []CallbackFunc) {
+	for _, cb := range watcherCallbacks {
+		cb(ev.Name, ev.Op)
+	}
+
+	type sectionListeners struct {
+		key       string
+		callbacks []func(pCfg, cCfg *Config)
+	}
+
+	cfgLocker.RLock()
+	sections := make([]sectionListeners, 0, len(sectionCallbackOrder))
+	for _, key := range sectionCallbackOrder {
+		registered, ok := sectionCallbacks[key]
+		if !ok {
+			continue
+		}
+		sections = append(sections, sectionListeners{
+			key:       key,
+			callbacks: append([]func(pCfg, cCfg *Config)(nil), registered...),
+		})
+	}
+	generic := append([]func(fsnotify.Event)(nil), configCallbacks...)
+	cfgLocker.RUnlock()
+
+	if cfg := loadConfigFile(ev.Name); cfg != nil {
+		for _, section := range sections {
+			if !cfg.HasField(section.key) {
+				continue
+			}
+			currentCfg, err := cfg.Child(section.key, -1)
+			if err != nil {
+				log.Error(err)
+				continue
+			}
+			// NOTE(review): latestConfig is accessed without a lock here, as before;
+			// confirm that only the watcher goroutine touches it.
+			previousCfg := latestConfig[section.key]
+			for _, f := range section.callbacks {
+				f(previousCfg, currentCfg)
+			}
+			latestConfig[section.key] = currentCfg
+		}
+	}
+
+	for _, cb := range generic {
+		cb(ev)
+	}
+}
+
var validExtensions = []string{".yml", ".yaml", ".tpl"}
func SetValidExtension(v []string) {
@@ -153,40 +183,7 @@ func AddPathToWatch(path string, callback CallbackFunc) {
time.Sleep(2 * time.Second)
log.Trace("2 seconds out, on:", ev.String())
- // AddPathToWatch
-
- for _, v := range watcher.callbacks {
- v(ev.Name, ev.Op)
- }
-
- // NotifyOnConfigChange
-
- for _, v := range configCallbacks {
- v(ev)
- }
-
- // NotifyOnConfigSectionChange
-
- cfg := loadConfigFile(ev.Name)
- if cfg == nil {
- continue
- }
-
- for k, v := range sectionCallbacks {
- if cfg.HasField(k) {
- currentCfg, err := cfg.Child(k, -1)
- if err != nil {
- log.Error(err)
- continue
- }
- // diff config
- previousCfg, _ := latestConfig[k]
- for _, f := range v {
- f(previousCfg, currentCfg)
- }
- latestConfig[k] = currentCfg
- }
- }
+ dispatchConfigChangeEvent(ev, watcher.callbacks)
}
}()
})
@@ -255,11 +252,13 @@ func StopWatchers() {
}
var sectionCallbacks = map[string][]func(pCfg, cCfg *Config){}
+var sectionCallbackOrder = []string{}
var configCallbacks = []func(fsnotify.Event){}
var cfgLocker = sync.RWMutex{}
// NotifyOnConfigSectionChange will trigger callback when any configuration file change detected and
-// configKey present in the changed file
+// configKey present in the changed file. Section callbacks run before generic NotifyOnConfigChange
+// callbacks so section-scoped state can be refreshed before dependent consumers reload.
func NotifyOnConfigSectionChange(configKey string, f func(pCfg, cCfg *Config)) {
cfgLocker.Lock()
defer cfgLocker.Unlock()
@@ -268,12 +267,14 @@ func NotifyOnConfigSectionChange(configKey string, f func(pCfg, cCfg *Config)) {
if !ok {
v = []func(pCfg, cCfg *Config){}
sectionCallbacks[configKey] = v
+ sectionCallbackOrder = append(sectionCallbackOrder, configKey)
}
v = append(v, f)
sectionCallbacks[configKey] = v
}
-// NotifyOnConfigChange will trigger callback when any configuration file change detected
+// NotifyOnConfigChange will trigger callback when any configuration file change detected, after any
+// matching NotifyOnConfigSectionChange callbacks for the same event have run.
func NotifyOnConfigChange(f func(fsnotify.Event)) {
cfgLocker.Lock()
defer cfgLocker.Unlock()
diff --git a/core/config/fs_watcher_test.go b/core/config/fs_watcher_test.go
new file mode 100644
index 000000000..5575f7812
--- /dev/null
+++ b/core/config/fs_watcher_test.go
@@ -0,0 +1,120 @@
+// Copyright (C) INFINI Labs & INFINI LIMITED.
+//
+// The INFINI Framework is offered under the GNU Affero General Public License v3.0
+// and as commercial software.
+//
+// For commercial licensing, contact us at:
+// - Website: infinilabs.com
+// - Email: hello@infini.ltd
+//
+// Open Source licensed under AGPL V3:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package config
+
+import (
+ "os"
+ "path/filepath"
+ "testing"
+
+ "github.com/fsnotify/fsnotify"
+)
+
+// TestDispatchConfigChangeEventRunsSectionCallbacksBeforeGenericCallbacks writes a config file
+// containing an "elasticsearch" section, registers one section callback and one generic
+// callback, dispatches a write event for the file and asserts the section callback ran first.
+func TestDispatchConfigChangeEventRunsSectionCallbacksBeforeGenericCallbacks(t *testing.T) {
+ dir := t.TempDir()
+ file := filepath.Join(dir, "generated_metrics_tasks.yml")
+ content := []byte("elasticsearch:\n - id: \"cluster-1\"\n name: \"cluster-1\"\n enabled: true\n endpoint: \"http://127.0.0.1:9200\"\n")
+ if err := os.WriteFile(file, content, 0o644); err != nil {
+ t.Fatalf("write config file: %v", err)
+ }
+
+ // Swap the package-level registries for empty ones so only the callbacks registered
+ // below are dispatched; the originals are put back in Cleanup.
+ previousSections := sectionCallbacks
+ previousOrder := sectionCallbackOrder
+ previousConfigs := configCallbacks
+ previousLatest := latestConfig
+ sectionCallbacks = map[string][]func(pCfg, cCfg *Config){}
+ sectionCallbackOrder = nil
+ configCallbacks = nil
+ latestConfig = map[string]*Config{}
+ t.Cleanup(func() {
+ sectionCallbacks = previousSections
+ sectionCallbackOrder = previousOrder
+ configCallbacks = previousConfigs
+ latestConfig = previousLatest
+ })
+
+ // order records the sequence in which the callbacks fire.
+ var order []string
+ NotifyOnConfigSectionChange("elasticsearch", func(pCfg, cCfg *Config) {
+ order = append(order, "section")
+ })
+ NotifyOnConfigChange(func(ev fsnotify.Event) {
+ order = append(order, "generic")
+ })
+
+ // No per-watcher callbacks are supplied (nil); only the global registries are exercised.
+ dispatchConfigChangeEvent(fsnotify.Event{Name: file, Op: fsnotify.Write}, nil)
+
+ if len(order) != 2 {
+ t.Fatalf("expected 2 callbacks, got %d (%v)", len(order), order)
+ }
+ if order[0] != "section" || order[1] != "generic" {
+ t.Fatalf("expected section callback before generic callback, got %v", order)
+ }
+}
+
+// TestDispatchConfigChangeEventRunsSectionCallbacksInRegistrationOrder writes a config file with
+// "flow", "router" and "entry" sections, registers one callback per section in that order,
+// dispatches a write event and asserts the callbacks fired in registration order.
+func TestDispatchConfigChangeEventRunsSectionCallbacksInRegistrationOrder(t *testing.T) {
+ dir := t.TempDir()
+ file := filepath.Join(dir, "gateway.yml")
+ content := []byte("flow:\n - name: flow-1\nrouter:\n - name: router-1\nentry:\n - name: entry-1\n")
+ if err := os.WriteFile(file, content, 0o644); err != nil {
+ t.Fatalf("write config file: %v", err)
+ }
+
+ // Swap the package-level registries for empty ones so only the callbacks registered
+ // below are dispatched; the originals are put back in Cleanup.
+ previousSections := sectionCallbacks
+ previousOrder := sectionCallbackOrder
+ previousConfigs := configCallbacks
+ previousLatest := latestConfig
+ sectionCallbacks = map[string][]func(pCfg, cCfg *Config){}
+ sectionCallbackOrder = nil
+ configCallbacks = nil
+ latestConfig = map[string]*Config{}
+ t.Cleanup(func() {
+ sectionCallbacks = previousSections
+ sectionCallbackOrder = previousOrder
+ configCallbacks = previousConfigs
+ latestConfig = previousLatest
+ })
+
+ // order records the sequence in which the section callbacks fire.
+ var order []string
+ NotifyOnConfigSectionChange("flow", func(pCfg, cCfg *Config) {
+ order = append(order, "flow")
+ })
+ NotifyOnConfigSectionChange("router", func(pCfg, cCfg *Config) {
+ order = append(order, "router")
+ })
+ NotifyOnConfigSectionChange("entry", func(pCfg, cCfg *Config) {
+ order = append(order, "entry")
+ })
+
+ dispatchConfigChangeEvent(fsnotify.Event{Name: file, Op: fsnotify.Write}, nil)
+
+ expected := []string{"flow", "router", "entry"}
+ if len(order) != len(expected) {
+ t.Fatalf("expected %d callbacks, got %d (%v)", len(expected), len(order), order)
+ }
+ for i, want := range expected {
+ if order[i] != want {
+ t.Fatalf("expected callback order %v, got %v", expected, order)
+ }
+ }
+}
diff --git a/core/elastic/actions.go b/core/elastic/actions.go
index dcc11b03e..5ff110ff2 100644
--- a/core/elastic/actions.go
+++ b/core/elastic/actions.go
@@ -118,10 +118,33 @@ func (node *NodeAvailable) IsDead() bool {
}
+// IsAvailable reports whether the cluster described by meta can currently be used.
+//
+// It returns false when the metadata has no config, when the config is disabled, or when
+// the clusterAvailable flag is false. Each negative outcome emits a debug log line that is
+// rate limited per cluster (keyed by config ID, falling back to the config name).
func (meta *ElasticsearchMetadata) IsAvailable() bool {
- if meta.Config == nil || !meta.Config.Enabled {
+	if meta.Config == nil {
+		if rate.GetRateLimiter("cluster_available_check", "nil_config", 1, 1, 30*time.Second).Allow() {
+			log.Debug("elasticsearch metadata is unavailable: config is nil")
+		}
return false
}
- return meta.clusterAvailable
+
+	// Work out why the cluster is unavailable (if it is) once, so the rate-limited
+	// logging below is shared by both negative outcomes.
+	var reason string
+	switch {
+	case !meta.Config.Enabled:
+		reason = "config disabled"
+	case !meta.clusterAvailable:
+		reason = "clusterAvailable=false"
+	default:
+		return true
+	}
+
+	clusterID := meta.Config.ID
+	if clusterID == "" {
+		clusterID = meta.Config.Name
+	}
+	if rate.GetRateLimiter("cluster_available_check", clusterID, 1, 1, 30*time.Second).Allow() {
+		log.Debugf("elasticsearch [%v] is unavailable: %s", meta.Config.Name, reason)
+	}
+	return false
}
func (meta *ElasticsearchMetadata) Init(health bool) {
@@ -186,13 +209,8 @@ func (meta *ElasticsearchMetadata) GetActiveEndpoint() string {
}
+// GetActivePreferredSeedHost returns the host reported by getAvailableSeedHost when that
+// host is non-empty, and otherwise falls back to meta.Config.Host.
func (meta *ElasticsearchMetadata) GetActivePreferredSeedHost() string {
- hosts := meta.GetSeedHosts()
- if len(hosts) > 0 {
- for _, v := range hosts {
- if v != "" && IsHostAvailable(v) {
- return v
- }
- }
+ if host, _ := meta.getAvailableSeedHost(); host != "" {
+ return host
}
return meta.Config.Host
}
@@ -263,6 +281,12 @@ func (meta *ElasticsearchMetadata) GetActiveHosts() int {
}
func (meta *ElasticsearchMetadata) GetActiveHost() string {
+ if host, info := meta.getAvailableSeedHost(); host != "" {
+ if info != nil {
+ meta.activeHost = info
+ }
+ return host
+ }
if meta.activeHost != nil {
if meta.activeHost.IsAvailable() {
@@ -320,6 +344,25 @@ func (meta *ElasticsearchMetadata) GetActiveHost() string {
return hosts[0]
}
+// getAvailableSeedHost returns the first non-empty seed host that IsHostAvailable accepts.
+//
+// The second result is that host's availability record when GetHostAvailableInfo has one
+// that itself reports available; otherwise it is nil. When no seed host qualifies the
+// function returns ("", nil).
+func (meta *ElasticsearchMetadata) getAvailableSeedHost() (string, *NodeAvailable) {
+	// Ranging over a nil or empty slice is a no-op, so no explicit emptiness check is
+	// needed. The loop variable is deliberately not called "hosts" to avoid shadowing the
+	// package-level hosts registry.
+	for _, seedHost := range meta.GetSeedHosts() {
+		if seedHost == "" || !IsHostAvailable(seedHost) {
+			continue
+		}
+		if info, ok := GetHostAvailableInfo(seedHost); ok && info != nil && info.IsAvailable() {
+			return seedHost, info
+		}
+		return seedHost, nil
+	}
+
+	return "", nil
+}
+
func (meta *ElasticsearchMetadata) IsTLS() bool {
return meta.GetSchema() == "https"
}
diff --git a/core/elastic/actions_test.go b/core/elastic/actions_test.go
new file mode 100644
index 000000000..db096d30f
--- /dev/null
+++ b/core/elastic/actions_test.go
@@ -0,0 +1,113 @@
+// Copyright (C) INFINI Labs & INFINI LIMITED.
+//
+// The INFINI Framework is offered under the GNU Affero General Public License v3.0
+// and as commercial software.
+//
+// For commercial licensing, contact us at:
+// - Website: infinilabs.com
+// - Email: hello@infini.ltd
+//
+// Open Source licensed under AGPL V3:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package elastic
+
+import (
+ "testing"
+ "time"
+
+ "infini.sh/framework/core/orm"
+)
+
+// TestGetActiveHostPrefersAvailableSeedHostOverCachedDiscoveredHost sets up metadata whose cached
+// activeHost is a discovered node address while the configured seed host is also marked
+// available, then asserts GetActiveHost returns the seed host and updates activeHost to it.
+func TestGetActiveHostPrefersAvailableSeedHostOverCachedDiscoveredHost(t *testing.T) {
+ const (
+ clusterID = "docker-mapped-port-cluster"
+ seedHost = "192.168.3.185:9211"
+ discoveredHost = "172.18.1.18:9200"
+ )
+
+ cfg := &ElasticsearchConfig{
+ ORMObjectBase: orm.ORMObjectBase{ID: clusterID},
+ Name: clusterID,
+ Host: seedHost,
+ Hosts: []string{seedHost},
+ Enabled: true,
+ }
+ cfg.Discovery.Enabled = true
+
+ meta := &ElasticsearchMetadata{
+ Config: cfg,
+ Nodes: &map[string]NodesInfo{
+ "node-1": {
+ Http: struct {
+ BoundAddress []string `json:"bound_address"`
+ PublishAddress string `json:"publish_address,omitempty"`
+ MaxContentLengthInBytes int64 `json:"max_content_length_in_bytes,omitempty"`
+ }{
+ PublishAddress: discoveredHost,
+ },
+ },
+ },
+ activeHost: &NodeAvailable{Host: discoveredHost, available: true, lastCheck: time.Now()},
+ }
+
+ // Register both hosts as available in the package-level hosts registry; the entries
+ // are removed again in Cleanup.
+ hosts.Store(seedHost, &NodeAvailable{Host: seedHost, ClusterID: clusterID, available: true, lastCheck: time.Now()})
+ hosts.Store(discoveredHost, &NodeAvailable{Host: discoveredHost, ClusterID: clusterID, available: true, lastCheck: time.Now()})
+ t.Cleanup(func() {
+ hosts.Delete(seedHost)
+ hosts.Delete(discoveredHost)
+ })
+
+ got := meta.GetActiveHost()
+ if got != seedHost {
+ t.Fatalf("expected seed host %q to be preferred over discovered host %q, got %q", seedHost, discoveredHost, got)
+ }
+ if meta.activeHost == nil || meta.activeHost.Host != seedHost {
+ t.Fatalf("expected activeHost to be updated to seed host %q, got %#v", seedHost, meta.activeHost)
+ }
+}
+
+// TestGetActiveHostFallsBackToCachedDiscoveredHostWhenSeedUnavailable marks the configured seed
+// host as unavailable while the cached activeHost (a discovered address) is available, and
+// asserts GetActiveHost returns the discovered address.
+func TestGetActiveHostFallsBackToCachedDiscoveredHostWhenSeedUnavailable(t *testing.T) {
+ const (
+ clusterID = "docker-discovery-fallback-cluster"
+ seedHost = "192.168.3.185:9211"
+ discoveredHost = "172.18.1.18:9200"
+ )
+
+ cfg := &ElasticsearchConfig{
+ ORMObjectBase: orm.ORMObjectBase{ID: clusterID},
+ Name: clusterID,
+ Host: seedHost,
+ Hosts: []string{seedHost},
+ Enabled: true,
+ }
+ cfg.Discovery.Enabled = true
+
+ meta := &ElasticsearchMetadata{
+ Config: cfg,
+ activeHost: &NodeAvailable{Host: discoveredHost, available: true, lastCheck: time.Now()},
+ }
+
+ // The seed host is stored with available=false; the discovered host with available=true.
+ // Both registry entries are removed again in Cleanup.
+ hosts.Store(seedHost, &NodeAvailable{Host: seedHost, ClusterID: clusterID, available: false, lastCheck: time.Now()})
+ hosts.Store(discoveredHost, &NodeAvailable{Host: discoveredHost, ClusterID: clusterID, available: true, lastCheck: time.Now()})
+ t.Cleanup(func() {
+ hosts.Delete(seedHost)
+ hosts.Delete(discoveredHost)
+ })
+
+ got := meta.GetActiveHost()
+ if got != discoveredHost {
+ t.Fatalf("expected discovered host %q when seed host is unavailable, got %q", discoveredHost, got)
+ }
+}
diff --git a/core/elastic/common_command.go b/core/elastic/common_command.go
index bc2cbdccf..0a4e9cd91 100644
--- a/core/elastic/common_command.go
+++ b/core/elastic/common_command.go
@@ -35,6 +35,7 @@ type CommonCommand struct {
ID string `json:"-" index:"id"`
Title string `json:"title" elastic_mapping:"title:{type:text,fields:{keyword:{type:keyword}}}"`
Tag []string `json:"tag" elastic_mapping:"tag:{type:keyword}"`
+ Creator string `json:"creator,omitempty" elastic_mapping:"creator:{type:keyword}"`
Requests []CommandRequest `json:"requests" elastic_mapping:"requests:{type:object}"`
Created time.Time `json:"created,omitempty" elastic_mapping:"created:{type:date}"`
}
diff --git a/core/elastic/domain_actions.go b/core/elastic/domain_actions.go
index d40bceadf..90e84f388 100644
--- a/core/elastic/domain_actions.go
+++ b/core/elastic/domain_actions.go
@@ -99,8 +99,14 @@ func RegisterInstance(cfg ElasticsearchConfig, handler API) {
UpdateClient(cfg, handler)
UpdateConfig(cfg)
+ meta := GetMetadata(cfg.ID)
+ if meta == nil {
+ InitMetadata(&cfg, false)
+ return
+ }
+
if exists && oldCfg != nil {
- InitMetadata(&cfg, true)
+ InitMetadata(&cfg, meta.IsAvailable())
}
}
diff --git a/core/elastic/domain_actions_test.go b/core/elastic/domain_actions_test.go
new file mode 100644
index 000000000..3132a15c5
--- /dev/null
+++ b/core/elastic/domain_actions_test.go
@@ -0,0 +1,58 @@
+// Copyright (C) INFINI Labs & INFINI LIMITED.
+//
+// The INFINI Framework is offered under the GNU Affero General Public License v3.0
+// and as commercial software.
+//
+// For commercial licensing, contact us at:
+// - Website: infinilabs.com
+// - Email: hello@infini.ltd
+//
+// Open Source licensed under AGPL V3:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package elastic
+
+import (
+ "testing"
+
+ "infini.sh/framework/core/orm"
+)
+
+// TestRegisterInstanceInitializesMetadataOnFirstRegistration registers a previously unknown
+// cluster config and asserts that metadata carrying that config (including its ClusterUUID)
+// is available afterwards.
+func TestRegisterInstanceInitializesMetadataOnFirstRegistration(t *testing.T) {
+	cfg := ElasticsearchConfig{
+		ORMObjectBase: orm.ORMObjectBase{ID: "test-first-sync"},
+		Name:          "test-first-sync",
+		Enabled:       true,
+		ClusterUUID:   "cluster-uuid-1",
+	}
+
+	// Drop everything the registration stored under this ID once the test finishes.
+	t.Cleanup(func() {
+		cfgs.Delete(cfg.ID)
+		apis.Delete(cfg.ID)
+		metas.Delete(cfg.ID)
+	})
+
+	RegisterInstance(cfg, nil)
+
+	meta := GetMetadata(cfg.ID)
+	switch {
+	case meta == nil:
+		t.Fatalf("expected metadata to be initialized for %s", cfg.ID)
+	case meta.Config == nil:
+		t.Fatalf("expected metadata config to be initialized for %s", cfg.ID)
+	case meta.Config.ClusterUUID != cfg.ClusterUUID:
+		t.Fatalf("expected cluster uuid %q, got %q", cfg.ClusterUUID, meta.Config.ClusterUUID)
+	}
+}
diff --git a/core/elastic/index.go b/core/elastic/index.go
index 6ca476b22..27768169e 100755
--- a/core/elastic/index.go
+++ b/core/elastic/index.go
@@ -24,10 +24,13 @@
package elastic
import (
+ "bytes"
"errors"
"github.com/buger/jsonparser"
"github.com/segmentio/encoding/json"
"infini.sh/framework/core/util"
+ "sort"
+ "strconv"
"strings"
"time"
)
@@ -217,6 +220,67 @@ type AggregationResponse struct {
Value interface{} `json:"value,omitempty"`
}
+// UnmarshalJSON decodes an aggregation whose "buckets" member is either a JSON array or a
+// keyed JSON object.
+//
+// An array is decoded straight into a.Buckets. A keyed object is flattened into a slice
+// ordered by compareBucketKeys; each bucket gets its object key stored under "key" unless
+// it already carries one. A missing or null "buckets" clears a.Buckets. Any other JSON
+// kind is ignored and leaves a.Buckets as it was. The "value" member is copied to a.Value.
+func (a *AggregationResponse) UnmarshalJSON(data []byte) error {
+	var raw struct {
+		Buckets json.RawMessage `json:"buckets,omitempty"`
+		Value   interface{}     `json:"value,omitempty"`
+	}
+	if err := json.Unmarshal(data, &raw); err != nil {
+		return err
+	}
+	a.Value = raw.Value
+
+	payload := bytes.TrimSpace(raw.Buckets)
+	if len(payload) == 0 || string(payload) == "null" {
+		a.Buckets = nil
+		return nil
+	}
+
+	if payload[0] == '[' {
+		return json.Unmarshal(payload, &a.Buckets)
+	}
+	if payload[0] != '{' {
+		// Neither an array nor an object: nothing to decode.
+		return nil
+	}
+
+	keyed := map[string]BucketBase{}
+	if err := json.Unmarshal(payload, &keyed); err != nil {
+		return err
+	}
+
+	// Map iteration order is random, so sort the keys before flattening.
+	names := make([]string, 0, len(keyed))
+	for name := range keyed {
+		names = append(names, name)
+	}
+	sort.Slice(names, func(i, j int) bool {
+		return compareBucketKeys(names[i], names[j])
+	})
+
+	ordered := make([]BucketBase, 0, len(names))
+	for _, name := range names {
+		entry := keyed[name]
+		if entry == nil {
+			// A JSON null bucket decodes to a nil map; replace it so the key can be stored.
+			entry = BucketBase{}
+		}
+		if _, present := entry["key"]; !present {
+			entry["key"] = name
+		}
+		ordered = append(ordered, entry)
+	}
+	a.Buckets = ordered
+	return nil
+}
+
+// compareBucketKeys reports whether left sorts before right.
+//
+// Keys that parse as base-10 integers are ordered numerically and always precede keys that
+// do not. Non-integer keys are ordered lexically. Integer keys with the same numeric value
+// (for example "1" and "01") fall back to lexical order so the result is a strict total
+// order, which sort.Slice needs to produce a deterministic, consistent result.
+func compareBucketKeys(left, right string) bool {
+	leftInt, leftErr := strconv.ParseInt(left, 10, 64)
+	rightInt, rightErr := strconv.ParseInt(right, 10, 64)
+
+	switch {
+	case leftErr == nil && rightErr == nil:
+		if leftInt != rightInt {
+			return leftInt < rightInt
+		}
+		return left < right
+	case leftErr == nil:
+		// Integer keys come before non-integer keys.
+		return true
+	case rightErr == nil:
+		return false
+	default:
+		return left < right
+	}
+}
+
type ResponseBase struct {
RawResult *util.Result `json:"-"`
StatusCode int `json:"-"`
@@ -235,6 +299,48 @@ type ErrorDetail struct {
Reason string `json:"reason,omitempty"`
}
+// UnmarshalJSON accepts an error detail given either as a plain JSON string or as a JSON
+// object.
+//
+// A string becomes the Reason of an otherwise empty ErrorDetail. An object is decoded
+// field by field. In both cases the receiver is replaced as a whole, so no field from a
+// previous decode survives. Empty input and JSON null leave the receiver untouched.
+func (d *ErrorDetail) UnmarshalJSON(data []byte) error {
+	data = bytes.TrimSpace(data)
+	if len(data) == 0 || string(data) == "null" {
+		return nil
+	}
+
+	if data[0] == '"' {
+		var reason string
+		if err := json.Unmarshal(data, &reason); err != nil {
+			return err
+		}
+		*d = ErrorDetail{Reason: reason}
+		return nil
+	}
+
+	// The alias type has the same fields but no methods, so decoding into it does not
+	// recurse back into this UnmarshalJSON.
+	type alias ErrorDetail
+	var aux alias
+	if err := json.Unmarshal(data, &aux); err != nil {
+		return err
+	}
+	*d = ErrorDetail(aux)
+	return nil
+}
+
+// Message returns the most specific human-readable text available for the error.
+//
+// Preference order: the detail's own Reason; then the root causes joined with "; "
+// (each contributing its Reason, or its Type when the Reason is empty); then the detail's
+// Type. A nil receiver yields the empty string.
+func (d *ErrorDetail) Message() string {
+	switch {
+	case d == nil:
+		return ""
+	case d.Reason != "":
+		return d.Reason
+	}
+
+	var parts []string
+	for _, cause := range d.RootCause {
+		switch {
+		case cause.Reason != "":
+			parts = append(parts, cause.Reason)
+		case cause.Type != "":
+			parts = append(parts, cause.Type)
+		}
+	}
+	if len(parts) > 0 {
+		return strings.Join(parts, "; ")
+	}
+	return d.Type
+}
+
type RootCause struct {
Type string `json:"type,omitempty"`
Reason string `json:"reason,omitempty"`
diff --git a/core/elastic/index_test.go b/core/elastic/index_test.go
index d36ae4d1c..399a9d6bc 100644
--- a/core/elastic/index_test.go
+++ b/core/elastic/index_test.go
@@ -25,8 +25,27 @@ package elastic
import (
"testing"
+
+ "github.com/segmentio/encoding/json"
)
+// TestAggregationResponseUnmarshalKeyedBuckets decodes a keyed "buckets" object and checks
+// that it is flattened into an ordered slice with each object key stored under "key".
+func TestAggregationResponseUnmarshalKeyedBuckets(t *testing.T) {
+	payload := []byte(`{"buckets":{"0":{"doc_count":1740269},"1":{"doc_count":42}}}`)
+
+	var agg AggregationResponse
+	if err := json.Unmarshal(payload, &agg); err != nil {
+		t.Fatalf("unexpected unmarshal error: %v", err)
+	}
+	if len(agg.Buckets) != 2 {
+		t.Fatalf("unexpected bucket count: %d", len(agg.Buckets))
+	}
+
+	first, second := agg.Buckets[0], agg.Buckets[1]
+	if first["key"] != "0" || first["doc_count"] != float64(1740269) {
+		t.Fatalf("unexpected first bucket: %#v", first)
+	}
+	if second["key"] != "1" || second["doc_count"] != float64(42) {
+		t.Fatalf("unexpected second bucket: %#v", second)
+	}
+}
+
func TestIndexDocument_GetStringFieldFromSource(t *testing.T) {
tests := []struct {
name string
@@ -220,3 +239,31 @@ func TestIndexDocument_TryGetStringFieldFromSource(t *testing.T) {
})
}
}
+
+// TestErrorDetailUnmarshalJSONString checks that a bare JSON string is decoded into Reason
+// and is what Message reports.
+func TestErrorDetailUnmarshalJSONString(t *testing.T) {
+	const want = "initializing"
+
+	var detail ErrorDetail
+	err := json.Unmarshal([]byte(`"initializing"`), &detail)
+	if err != nil {
+		t.Fatalf("unexpected unmarshal error: %v", err)
+	}
+
+	if got := detail.Reason; got != want {
+		t.Fatalf("unexpected reason: %q", got)
+	}
+	if got := detail.Message(); got != want {
+		t.Fatalf("unexpected message: %q", got)
+	}
+}
+
+// TestErrorDetailUnmarshalJSONObject checks that the object form populates Type and that
+// Message reports the object's reason.
+func TestErrorDetailUnmarshalJSONObject(t *testing.T) {
+	payload := []byte(`{"type":"search_phase_execution_exception","reason":"all shards failed"}`)
+
+	var detail ErrorDetail
+	err := json.Unmarshal(payload, &detail)
+	if err != nil {
+		t.Fatalf("unexpected unmarshal error: %v", err)
+	}
+
+	if got := detail.Type; got != "search_phase_execution_exception" {
+		t.Fatalf("unexpected type: %q", got)
+	}
+	if got := detail.Message(); got != "all shards failed" {
+		t.Fatalf("unexpected message: %q", got)
+	}
+}
diff --git a/core/elastic/partition.go b/core/elastic/partition.go
index 63d474fb1..4169ca9e9 100644
--- a/core/elastic/partition.go
+++ b/core/elastic/partition.go
@@ -32,6 +32,7 @@ import (
"fmt"
"math"
"net/http"
+ "sort"
"strconv"
"strings"
@@ -40,12 +41,14 @@ import (
)
+// PartitionQuery is the request accepted by GetPartitions.
+//
+// Strategy selects how partitions are derived and is normalized by
+// normalizePartitionStrategy (an empty value means the step strategy). Step is read by the
+// step strategy; PartitionCount is passed to the quantile, terms and hash strategies.
type PartitionQuery struct {
- IndexName string `json:"index_name"`
- FieldType string `json:"field_type"`
- FieldName string `json:"field_name"`
- Step interface{} `json:"step"`
- Filter interface{} `json:"filter"`
- DocType string `json:"doc_type"`
+ IndexName string `json:"index_name"`
+ FieldType string `json:"field_type"`
+ FieldName string `json:"field_name"`
+ Strategy string `json:"strategy,omitempty"`
+ Step interface{} `json:"step,omitempty"`
+ PartitionCount int `json:"partition_count,omitempty"`
+ Filter interface{} `json:"filter"`
+ DocType string `json:"doc_type"`
}
type PartitionInfo struct {
@@ -54,6 +57,8 @@ type PartitionInfo struct {
End float64 `json:"end"`
Filter map[string]interface{} `json:"filter"`
Docs int64 `json:"docs"`
+ Label string `json:"label,omitempty"`
+ Values []string `json:"values,omitempty"`
Other bool
}
@@ -68,6 +73,11 @@ const (
PartitionByDate = "date"
PartitionByKeyword = "keyword"
PartitionByNumber = "number"
+
+ PartitionStrategyStep = "step"
+ PartitionStrategyQuantile = "quantile"
+ PartitionStrategyTerms = "terms"
+ PartitionStrategyHash = "hash"
)
func GetPartitions(q *PartitionQuery, client API) ([]PartitionInfo, error) {
@@ -100,32 +110,6 @@ func GetPartitions(q *PartitionQuery, client API) ([]PartitionInfo, error) {
switch q.FieldType {
case PartitionByDate, PartitionByNumber:
- var step float64
- if q.FieldType == PartitionByDate {
- if stepV, ok := q.Step.(string); !ok {
- return nil, fmt.Errorf("expect step value of string type since filedtype is %s", PartitionByDate)
- } else {
- du, err := util.ParseDuration(stepV)
- if err != nil {
- return nil, fmt.Errorf("parse step duration error: %w", err)
- }
- step = float64(du.Milliseconds())
- }
- } else {
- switch q.Step.(type) {
- case float64:
- step = q.Step.(float64)
- case string:
- v, err := strconv.Atoi(q.Step.(string))
- if err != nil {
- return nil, fmt.Errorf("convert step error: %w", err)
- }
- step = float64(v)
- default:
- return nil, fmt.Errorf("invalid parameter step: %v", q.Step)
- }
- }
-
result, err := getBoundValues(client, q.IndexName, q.FieldName, vFilter)
if err != nil {
return nil, err
@@ -138,23 +122,110 @@ func GetPartitions(q *PartitionQuery, client API) ([]PartitionInfo, error) {
var (
partitions []PartitionInfo
)
- partitions, err = getPartitionsByAgg(client, q.IndexName, q.FieldName, q.FieldType, step, vFilter)
- if err != nil {
- return nil, err
+
+ switch normalizePartitionStrategy(q.Strategy) {
+ case PartitionStrategyStep:
+ step, err := parsePartitionStep(q.FieldType, q.Step)
+ if err != nil {
+ return nil, err
+ }
+ partitions, err = getPartitionsByAgg(client, q.IndexName, q.FieldName, q.FieldType, step, vFilter)
+ if err != nil {
+ return nil, err
+ }
+ case PartitionStrategyQuantile:
+ partitions, err = getPartitionsByQuantile(client, q.IndexName, q.FieldName, q.FieldType, q.PartitionCount, result.Min, result.Max, vFilter)
+ if err != nil {
+ return nil, err
+ }
+ default:
+ return nil, fmt.Errorf("unsupported partition strategy: %s", q.Strategy)
}
+
if result.Null > 0 {
partitions = append(partitions, PartitionInfo{
Filter: result.NotExistsFilter,
Other: true,
+ Label: "Missing values",
Docs: result.Null,
})
}
return partitions, nil
+ case PartitionByKeyword:
+ var (
+ partitions []PartitionInfo
+ err error
+ )
+ switch normalizePartitionStrategy(q.Strategy) {
+ case PartitionStrategyTerms:
+ partitions, err = getPartitionsByTerms(client, q.IndexName, q.FieldName, q.PartitionCount, vFilter)
+ if err != nil {
+ return nil, err
+ }
+ case PartitionStrategyHash:
+ partitions, err = getPartitionsByHash(client, q.IndexName, q.FieldName, q.PartitionCount, vFilter)
+ if err != nil {
+ return nil, err
+ }
+ default:
+ return nil, fmt.Errorf("unsupported partition strategy: %s", q.Strategy)
+ }
+
+ missingPartition, err := getMissingPartition(client, q.IndexName, q.FieldName, vFilter)
+ if err != nil {
+ return nil, err
+ }
+ if missingPartition != nil {
+ partitions = append(partitions, *missingPartition)
+ }
+ return partitions, nil
default:
return nil, fmt.Errorf("unsupported field type: %s", q.FieldType)
}
}
+// normalizePartitionStrategy trims and lower-cases strategy so it can be compared against
+// the PartitionStrategy* constants. An empty strategy defaults to PartitionStrategyStep.
+// Unknown values are returned in normalized form and left for the caller to reject.
+func normalizePartitionStrategy(strategy string) string {
+	normalized := strings.ToLower(strings.TrimSpace(strategy))
+	if normalized == "" {
+		return PartitionStrategyStep
+	}
+	return normalized
+}
+
+// parsePartitionStep converts the user-supplied step into a numeric bucket width.
+//
+// For date fields stepValue must be a duration string and the result is that duration in
+// milliseconds. For other field types stepValue may be a float64 or a numeric string
+// (fractional values are accepted in both forms). The resulting step must be a finite
+// number greater than zero; anything else is reported as an invalid parameter, since it
+// cannot be used as a bucket interval.
+func parsePartitionStep(fieldType string, stepValue interface{}) (float64, error) {
+	var step float64
+
+	if fieldType == PartitionByDate {
+		stepV, ok := stepValue.(string)
+		if !ok {
+			return 0, fmt.Errorf("expect step value of string type since filedtype is %s", PartitionByDate)
+		}
+		du, err := util.ParseDuration(stepV)
+		if err != nil {
+			return 0, fmt.Errorf("parse step duration error: %w", err)
+		}
+		step = float64(du.Milliseconds())
+	} else {
+		switch v := stepValue.(type) {
+		case float64:
+			step = v
+		case string:
+			parsed, err := strconv.ParseFloat(v, 64)
+			if err != nil {
+				return 0, fmt.Errorf("convert step error: %w", err)
+			}
+			step = parsed
+		default:
+			return 0, fmt.Errorf("invalid parameter step: %v", stepValue)
+		}
+	}
+
+	if math.IsNaN(step) || math.IsInf(step, 0) || step <= 0 {
+		return 0, fmt.Errorf("invalid parameter step: %v", stepValue)
+	}
+	return step, nil
+}
+
func getPartitionsByAgg(client API, indexName string, fieldName, fieldType string, step float64, filter interface{}) ([]PartitionInfo, error) {
queryDsl := util.MapStr{
"size": 0,
@@ -182,7 +253,7 @@ func getPartitionsByAgg(client API, indexName string, fieldName, fieldType strin
if filter != nil {
queryDsl["query"] = filter
}
- res, err := client.SearchWithRawQueryDSL(indexName, util.MustToJSONBytes(queryDsl))
+ res, err := searchPartitionWithRawQueryDSL(client, indexName, queryDsl)
if err != nil {
return nil, err
}
@@ -217,13 +288,406 @@ func getPartitionsByAgg(client API, indexName string, fieldName, fieldType strin
Docs: int64(docCount),
Other: false,
}
- partition.Filter = buildPartitionFilter(min, max, fieldName, fieldType, filter)
+ partition.Filter = buildBoundedPartitionFilter(min, max, fieldName, fieldType, filter)
partitions = append(partitions, partition)
}
}
return partitions, nil
}
+func getPartitionsByQuantile(client API, indexName string, fieldName, fieldType string, partitionCount int, min, max float64, filter interface{}) ([]PartitionInfo, error) {
+ if partitionCount <= 0 {
+ return nil, fmt.Errorf("invalid parameter partition_count: %d", partitionCount)
+ }
+
+ boundaries, err := getQuantileBoundaries(client, indexName, fieldName, partitionCount, min, max, filter)
+ if err != nil {
+ return nil, err
+ }
+ partitions := buildQuantilePartitions(boundaries, fieldName, fieldType, filter)
+ if len(partitions) == 0 {
+ return nil, nil
+ }
+
+ counts, err := getPartitionDocCounts(client, indexName, partitions)
+ if err != nil {
+ return nil, err
+ }
+
+ filtered := make([]PartitionInfo, 0, len(partitions))
+ for i := range partitions {
+ partitions[i].Docs = counts[i]
+ if partitions[i].Docs <= 0 {
+ continue
+ }
+ filtered = append(filtered, partitions[i])
+ }
+ return filtered, nil
+}
+
+func getPartitionsByTerms(client API, indexName, fieldName string, partitionCount int, filter interface{}) ([]PartitionInfo, error) {
+ if partitionCount <= 0 {
+ return nil, fmt.Errorf("invalid parameter partition_count: %d", partitionCount)
+ }
+
+ queryDsl := util.MapStr{
+ "size": 0,
+ "aggs": util.MapStr{
+ "partitions": util.MapStr{
+ "terms": util.MapStr{
+ "field": fieldName,
+ "size": partitionCount,
+ },
+ },
+ },
+ }
+ if filter != nil {
+ queryDsl["query"] = filter
+ }
+
+ res, err := searchPartitionWithRawQueryDSL(client, indexName, queryDsl)
+ if err != nil {
+ return nil, err
+ }
+
+ var (
+ partitions []PartitionInfo
+ values []string
+ )
+ if partitionsAgg, ok := res.Aggregations["partitions"]; ok {
+ for idx, bucket := range partitionsAgg.Buckets {
+ value := fmt.Sprintf("%v", bucket["key"])
+ docCount := util.GetInt64Value(bucket["doc_count"])
+ if docCount <= 0 {
+ continue
+ }
+ values = append(values, value)
+ partitions = append(partitions, PartitionInfo{
+ Key: float64(idx),
+ Docs: docCount,
+ Label: value,
+ Values: []string{value},
+ Filter: buildExactTermPartitionFilter(value, fieldName, filter),
+ })
+ }
+ }
+
+ sumOtherDocCount, _ := jsonparser.GetInt(res.RawResult.Body, "aggregations", "partitions", "sum_other_doc_count")
+ if sumOtherDocCount > 0 {
+ partitions = append(partitions, PartitionInfo{
+ Key: float64(len(partitions)),
+ Docs: sumOtherDocCount,
+ Label: "Other terms",
+ Values: append([]string(nil), values...),
+ Filter: buildOtherTermsPartitionFilter(values, fieldName, filter),
+ Other: true,
+ })
+ }
+
+ return partitions, nil
+}
+
+func getPartitionsByHash(client API, indexName, fieldName string, partitionCount int, filter interface{}) ([]PartitionInfo, error) {
+ if partitionCount <= 0 {
+ return nil, fmt.Errorf("invalid parameter partition_count: %d", partitionCount)
+ }
+
+ partitions := make([]PartitionInfo, 0, partitionCount)
+ for idx := 0; idx < partitionCount; idx++ {
+ partitions = append(partitions, PartitionInfo{
+ Key: float64(idx),
+ Label: fmt.Sprintf("Hash %d/%d", idx+1, partitionCount),
+ Filter: buildHashPartitionFilter(idx, partitionCount, fieldName, filter),
+ })
+ }
+
+ counts, err := getHashPartitionDocCounts(client, indexName, fieldName, partitionCount, filter)
+ if err != nil {
+ return nil, err
+ }
+
+ filtered := make([]PartitionInfo, 0, len(partitions))
+ for idx := range partitions {
+ partitions[idx].Docs = counts[idx]
+ if partitions[idx].Docs <= 0 {
+ continue
+ }
+ filtered = append(filtered, partitions[idx])
+ }
+ return filtered, nil
+}
+
+func getHashPartitionDocCounts(client API, indexName, fieldName string, partitionCount int, filter interface{}) ([]int64, error) {
+ queryDsl := buildHashPartitionAggQuery(fieldName, partitionCount, filter)
+ res, err := searchPartitionWithRawQueryDSL(client, indexName, queryDsl)
+ if err != nil {
+ return nil, err
+ }
+ return extractHashPartitionDocCounts(res, partitionCount), nil
+}
+
+func buildHashPartitionAggQuery(fieldName string, partitionCount int, filter interface{}) util.MapStr {
+ fieldLiteral := buildPainlessStringLiteral(fieldName)
+ queryDsl := util.MapStr{
+ "size": 0,
+ "aggs": util.MapStr{
+ "partitions": util.MapStr{
+ "terms": util.MapStr{
+ "size": partitionCount,
+ "value_type": "long",
+ "script": util.MapStr{
+ "lang": "painless",
+ // Keep the aggregation-side hash logic identical to the partition filter so each
+ // bucket count matches the documents selected when a migration resumes that bucket.
+ "source": fmt.Sprintf("if (doc[%s].size()==0 || doc[%s].value == '') return null; return (((doc[%s].value.hashCode() %% params.partition_count) + params.partition_count) %% params.partition_count);", fieldLiteral, fieldLiteral, fieldLiteral),
+ "params": util.MapStr{
+ "partition_count": partitionCount,
+ },
+ },
+ },
+ },
+ },
+ }
+ if filter != nil {
+ queryDsl["query"] = filter
+ }
+ return queryDsl
+}
+
+func extractHashPartitionDocCounts(res *SearchResponse, partitionCount int) []int64 {
+ counts := make([]int64, partitionCount)
+ if res == nil {
+ return counts
+ }
+ partitionsAgg, ok := res.Aggregations["partitions"]
+ if !ok {
+ return counts
+ }
+ for _, bucket := range partitionsAgg.Buckets {
+ bucketKey, ok := extractHashPartitionBucketKey(bucket["key"])
+ if !ok || bucketKey < 0 || bucketKey >= partitionCount {
+ continue
+ }
+ counts[bucketKey] = util.GetInt64Value(bucket["doc_count"])
+ }
+ return counts
+}
+
+func extractHashPartitionBucketKey(key interface{}) (int, bool) {
+ switch v := key.(type) {
+ case int:
+ return v, true
+ case int64:
+ return int(v), true
+ case int32:
+ return int(v), true
+ case uint:
+ return int(v), true
+ case uint64:
+ return int(v), true
+ case float64:
+ return int(v), true
+ case float32:
+ return int(v), true
+ case string:
+ parsed, err := strconv.Atoi(v)
+ if err != nil {
+ return 0, false
+ }
+ return parsed, true
+ default:
+ return 0, false
+ }
+}
+
+func getQuantileBoundaries(client API, indexName, fieldName string, partitionCount int, min, max float64, filter interface{}) ([]float64, error) {
+ percents := buildQuantilePercents(partitionCount)
+ if len(percents) == 0 {
+ return []float64{min, max}, nil
+ }
+
+ queryDsl := util.MapStr{
+ "size": 0,
+ "aggs": util.MapStr{
+ "partition_percentiles": util.MapStr{
+ "percentiles": util.MapStr{
+ "field": fieldName,
+ "percents": percents,
+ "keyed": false,
+ },
+ },
+ },
+ }
+ if filter != nil {
+ queryDsl["query"] = filter
+ }
+
+ res, err := searchPartitionWithRawQueryDSL(client, indexName, queryDsl)
+ if err != nil {
+ return nil, err
+ }
+
+ boundaries := make([]float64, 0, len(percents)+2)
+ boundaries = append(boundaries, min)
+ _, err = jsonparser.ArrayEach(res.RawResult.Body, func(value []byte, _ jsonparser.ValueType, _ int, err error) {
+ if err != nil {
+ return
+ }
+ boundary, parseErr := jsonparser.GetFloat(value, "value")
+ if parseErr != nil || math.IsNaN(boundary) || math.IsInf(boundary, 0) {
+ return
+ }
+ boundaries = append(boundaries, boundary)
+ }, "aggregations", "partition_percentiles", "values")
+ if err != nil {
+ return nil, err
+ }
+ boundaries = append(boundaries, max)
+ boundaries = dedupeSortedBoundaries(boundaries)
+ if len(boundaries) == 1 {
+ return []float64{boundaries[0], boundaries[0]}, nil
+ }
+ return boundaries, nil
+}
+
+func buildQuantilePercents(partitionCount int) []float64 {
+ if partitionCount <= 1 {
+ return nil
+ }
+ percents := make([]float64, 0, partitionCount-1)
+ for i := 1; i < partitionCount; i++ {
+ percents = append(percents, float64(i)*100/float64(partitionCount))
+ }
+ return percents
+}
+
+func dedupeSortedBoundaries(boundaries []float64) []float64 {
+ if len(boundaries) == 0 {
+ return nil
+ }
+ sort.Float64s(boundaries)
+ result := make([]float64, 0, len(boundaries))
+ for _, boundary := range boundaries {
+ // Percentile aggregations on skewed datasets can return the same boundary more than once.
+ // Drop duplicates here so later range filters do not create zero-width partitions.
+ if len(result) == 0 || !sameBoundary(result[len(result)-1], boundary) {
+ result = append(result, boundary)
+ }
+ }
+ return result
+}
+
+func sameBoundary(left, right float64) bool {
+ return math.Abs(left-right) <= 1e-9
+}
+
+func buildQuantilePartitions(boundaries []float64, fieldName, fieldType string, filter interface{}) []PartitionInfo {
+ if len(boundaries) < 2 {
+ return nil
+ }
+
+ partitions := make([]PartitionInfo, 0, len(boundaries)-1)
+ if len(boundaries) == 2 {
+ partitions = append(partitions, PartitionInfo{
+ Key: boundaries[1],
+ Start: boundaries[0],
+ End: boundaries[1],
+ Filter: buildOpenPartitionFilter(nil, nil, fieldName, fieldType, filter),
+ })
+ return partitions
+ }
+
+ for i := 1; i < len(boundaries); i++ {
+ lower, upper := boundaries[i-1], boundaries[i]
+ if sameBoundary(lower, upper) {
+ continue
+ }
+
+ var lowerRef, upperRef *float64
+ if i > 1 {
+ lowerRef = &lower
+ }
+ if i < len(boundaries)-1 {
+ upperRef = &upper
+ }
+
+ partitions = append(partitions, PartitionInfo{
+ Key: upper,
+ Start: lower,
+ End: upper,
+ Filter: buildOpenPartitionFilter(lowerRef, upperRef, fieldName, fieldType, filter),
+ })
+ }
+ return partitions
+}
+
+func getPartitionDocCounts(client API, indexName string, partitions []PartitionInfo) ([]int64, error) {
+ queryDsl := util.MapStr{
+ "size": 0,
+ "aggs": util.MapStr{
+ "partitions": util.MapStr{
+ "filters": util.MapStr{
+ "filters": buildPartitionFiltersMap(partitions),
+ },
+ },
+ },
+ }
+
+ res, err := searchPartitionWithRawQueryDSL(client, indexName, queryDsl)
+ if err != nil {
+ return nil, err
+ }
+
+ counts := make([]int64, 0, len(partitions))
+ for i := range partitions {
+ docCount, parseErr := jsonparser.GetInt(res.RawResult.Body, "aggregations", "partitions", "buckets", strconv.Itoa(i), "doc_count")
+ if parseErr != nil {
+ return nil, parseErr
+ }
+ counts = append(counts, docCount)
+ }
+ return counts, nil
+}
+
+func buildPartitionFiltersMap(partitions []PartitionInfo) util.MapStr {
+ filters := util.MapStr{}
+ for i, partition := range partitions {
+ filters[strconv.Itoa(i)] = partition.Filter
+ }
+ return filters
+}
+
+func getMissingPartition(client API, indexName, fieldName string, filter interface{}) (*PartitionInfo, error) {
+ queryDsl := util.MapStr{
+ "size": 0,
+ "aggs": util.MapStr{
+ "missing_field": util.MapStr{
+ "filter": buildMissingFieldCondition(fieldName),
+ },
+ },
+ }
+ if filter != nil {
+ queryDsl["query"] = filter
+ }
+
+ res, err := searchPartitionWithRawQueryDSL(client, indexName, queryDsl)
+ if err != nil {
+ return nil, err
+ }
+
+ docCount, err := jsonparser.GetInt(res.RawResult.Body, "aggregations", "missing_field", "doc_count")
+ if err != nil || docCount <= 0 {
+ return nil, err
+ }
+
+ return &PartitionInfo{
+ Docs: docCount,
+ Label: "Missing values",
+ Filter: buildMissingFieldFilter(fieldName, filter),
+ Other: true,
+ }, nil
+}
+
// NOTE: we assume GetPartitions returned sorted buckets from ES, if not, we need to manually sort source & target partitions by keys
// sourcePartitions & targetPartitions must've been generated with same bucket step & offset
func MergePartitions(sourcePartitions []PartitionInfo, targetPartitions []PartitionInfo, fieldName, fieldType string, filter interface{}) []PartitionInfo {
@@ -253,7 +717,7 @@ func MergePartitions(sourcePartitions []PartitionInfo, targetPartitions []Partit
Docs: util.MaxInt64(source.Docs, target.Docs),
Other: false,
}
- partition.Filter = buildPartitionFilter(partition.Start, partition.End, fieldName, fieldType, filter)
+ partition.Filter = buildBoundedPartitionFilter(partition.Start, partition.End, fieldName, fieldType, filter)
ret = append(ret, partition)
sourceIdx += 1
targetIdx += 1
@@ -267,12 +731,14 @@ func MergePartitions(sourcePartitions []PartitionInfo, targetPartitions []Partit
return ret
}
-func buildPartitionFilter(min, max float64, fieldName, fieldType string, filter interface{}) util.MapStr {
+func buildBoundedPartitionFilter(min, max float64, fieldName, fieldType string, filter interface{}) util.MapStr {
rv := util.MapStr{
"gte": min,
"lte": max,
}
if fieldType == PartitionByDate {
+ rv["gte"] = normalizeDateRangeBoundary(min, true, true)
+ rv["lte"] = normalizeDateRangeBoundary(max, false, true)
rv["format"] = "epoch_millis"
}
must := []interface{}{
@@ -290,7 +756,217 @@ func buildPartitionFilter(min, max float64, fieldName, fieldType string, filter
"must": must,
},
}
+}
+
+func buildOpenPartitionFilter(lower, upper *float64, fieldName, fieldType string, filter interface{}) util.MapStr {
+ rv := util.MapStr{}
+ if lower != nil {
+ rv["gt"] = *lower
+ }
+ if upper != nil {
+ rv["lte"] = *upper
+ }
+ if fieldType == PartitionByDate {
+ if lower != nil {
+ rv["gt"] = normalizeDateRangeBoundary(*lower, true, false)
+ }
+ if upper != nil {
+ rv["lte"] = normalizeDateRangeBoundary(*upper, false, true)
+ }
+ rv["format"] = "epoch_millis"
+ }
+ var condition interface{}
+ if len(rv) == 0 || (len(rv) == 1 && rv["format"] != nil) {
+ condition = util.MapStr{
+ "exists": util.MapStr{
+ "field": fieldName,
+ },
+ }
+ } else {
+ condition = util.MapStr{
+ "range": util.MapStr{
+ fieldName: rv,
+ },
+ }
+ }
+ must := []interface{}{condition}
+ if filter != nil {
+ must = append(must, filter)
+ }
+ return util.MapStr{
+ "bool": util.MapStr{
+ "must": must,
+ },
+ }
+
+}
+func normalizeDateRangeBoundary(value float64, lower, inclusive bool) int64 {
+ switch {
+ case lower && inclusive:
+ return int64(math.Ceil(value))
+ case lower && !inclusive:
+ return int64(math.Floor(value))
+ case !lower && inclusive:
+ return int64(math.Floor(value))
+ default:
+ return int64(math.Ceil(value))
+ }
+}
+
+func buildExactTermPartitionFilter(value, fieldName string, filter interface{}) util.MapStr {
+ return buildMustPartitionFilter([]interface{}{
+ util.MapStr{
+ "term": util.MapStr{
+ fieldName: util.MapStr{
+ "value": value,
+ },
+ },
+ },
+ }, filter)
+}
+
+func buildOtherTermsPartitionFilter(values []string, fieldName string, filter interface{}) util.MapStr {
+ boolFilter := util.MapStr{
+ "must": []interface{}{
+ util.MapStr{
+ "exists": util.MapStr{
+ "field": fieldName,
+ },
+ },
+ },
+ }
+ if filter != nil {
+ boolFilter["must"] = append(boolFilter["must"].([]interface{}), filter)
+ }
+ if len(values) > 0 {
+ boolFilter["must_not"] = []interface{}{
+ util.MapStr{
+ "terms": util.MapStr{
+ fieldName: values,
+ },
+ },
+ }
+ }
+ return util.MapStr{
+ "bool": boolFilter,
+ }
+}
+
+func buildHashPartitionFilter(partitionID, partitionCount int, fieldName string, filter interface{}) util.MapStr {
+ fieldLiteral := buildPainlessStringLiteral(fieldName)
+ return buildMustPartitionFilter([]interface{}{
+ util.MapStr{
+ "script": util.MapStr{
+ "script": util.MapStr{
+ "lang": "painless",
+ "source": fmt.Sprintf("doc[%s].size()!=0 && doc[%s].value != '' && (((doc[%s].value.hashCode() %% params.partition_count) + params.partition_count) %% params.partition_count) == params.partition_id", fieldLiteral, fieldLiteral, fieldLiteral),
+ "params": util.MapStr{
+ "partition_count": partitionCount,
+ "partition_id": partitionID,
+ },
+ },
+ },
+ },
+ }, filter)
+}
+
+func buildPainlessStringLiteral(value string) string {
+ replacer := strings.NewReplacer(`\`, `\\`, `'`, `\'`)
+ return "'" + replacer.Replace(value) + "'"
+}
+
+func searchPartitionWithRawQueryDSL(client API, indexName string, queryDsl util.MapStr) (*SearchResponse, error) {
+ res, err := client.SearchWithRawQueryDSL(indexName, util.MustToJSONBytes(queryDsl))
+ if err != nil {
+ return nil, err
+ }
+ if err := ensurePartitionSearchResponseOK(res); err != nil {
+ return nil, err
+ }
+ return res, nil
+}
+
+func ensurePartitionSearchResponseOK(res *SearchResponse) error {
+ if res == nil {
+ return errors.New("empty search response")
+ }
+ if res.StatusCode == 0 || res.StatusCode == http.StatusOK {
+ return nil
+ }
+ if res.RawResult != nil && len(res.RawResult.Body) > 0 {
+ for _, path := range [][]string{
+ {"error", "failed_shards", "[0]", "reason", "caused_by", "reason"},
+ {"error", "failed_shards", "[0]", "reason", "reason"},
+ {"error", "root_cause", "[0]", "reason"},
+ {"error", "reason"},
+ } {
+ if msg, ok := getJSONPathString(res.RawResult.Body, path...); ok && msg != "" {
+ return errors.New(msg)
+ }
+ }
+ }
+ if msg := res.Error.Message(); msg != "" {
+ return errors.New(msg)
+ }
+ if res.RawResult != nil && len(res.RawResult.Body) > 0 {
+ return errors.New(string(res.RawResult.Body))
+ }
+ return fmt.Errorf("unexpected search status: %d", res.StatusCode)
+}
+
+func getJSONPathString(data []byte, path ...string) (string, bool) {
+ v, err := jsonparser.GetString(data, path...)
+ if err != nil {
+ return "", false
+ }
+ return v, true
+}
+
+func buildMissingFieldCondition(fieldName string) util.MapStr {
+ return util.MapStr{
+ "bool": util.MapStr{
+ "should": []interface{}{
+ util.MapStr{
+ "bool": util.MapStr{
+ "must_not": []interface{}{
+ util.MapStr{
+ "exists": util.MapStr{
+ "field": fieldName,
+ },
+ },
+ },
+ },
+ },
+ util.MapStr{
+ "term": util.MapStr{
+ fieldName: util.MapStr{
+ "value": "",
+ },
+ },
+ },
+ },
+ "minimum_should_match": 1,
+ },
+ }
+}
+
+func buildMissingFieldFilter(fieldName string, filter interface{}) util.MapStr {
+ return buildMustPartitionFilter([]interface{}{
+ buildMissingFieldCondition(fieldName),
+ }, filter)
+}
+
+func buildMustPartitionFilter(mustClauses []interface{}, filter interface{}) util.MapStr {
+ must := append([]interface{}{}, mustClauses...)
+ if filter != nil {
+ must = append(must, filter)
+ }
+ return util.MapStr{
+ "bool": util.MapStr{
+ "must": must,
+ },
+ }
}
func getBoundValues(client API, indexName string, fieldName string, filter interface{}) (*BoundValuesResult, error) {
@@ -326,7 +1002,7 @@ func getBoundValues(client API, indexName string, fieldName string, filter inter
if filter != nil {
queryDsl["query"] = filter
}
- res, err := client.SearchWithRawQueryDSL(indexName, util.MustToJSONBytes(queryDsl))
+ res, err := searchPartitionWithRawQueryDSL(client, indexName, queryDsl)
if err != nil {
return nil, err
}
diff --git a/core/elastic/partition_test.go b/core/elastic/partition_test.go
new file mode 100644
index 000000000..86c0d00ad
--- /dev/null
+++ b/core/elastic/partition_test.go
@@ -0,0 +1,321 @@
+// Copyright (C) INFINI Labs & INFINI LIMITED.
+//
+// The INFINI Framework is offered under the GNU Affero General Public License v3.0
+// and as commercial software.
+//
+// For commercial licensing, contact us at:
+// - Website: infinilabs.com
+// - Email: hello@infini.ltd
+//
+// Open Source licensed under AGPL V3:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+package elastic
+
+import (
+ "net/http"
+ "reflect"
+ "strings"
+ "testing"
+
+ "infini.sh/framework/core/util"
+)
+
+func TestBuildQuantilePercents(t *testing.T) {
+ got := buildQuantilePercents(4)
+ want := []float64{25, 50, 75}
+ if !reflect.DeepEqual(got, want) {
+ t.Fatalf("unexpected percents: got %v want %v", got, want)
+ }
+}
+
+func TestBuildQuantilePartitionsCreatesOpenEdgeRanges(t *testing.T) {
+ partitions := buildQuantilePartitions([]float64{10, 20, 30}, "value", PartitionByNumber, nil)
+ if len(partitions) != 2 {
+ t.Fatalf("unexpected partition count: %d", len(partitions))
+ }
+
+ firstRange := getMustClause(t, partitions[0].Filter)["range"].(util.MapStr)["value"].(util.MapStr)
+ if _, ok := firstRange["gt"]; ok {
+ t.Fatalf("expected first partition to have no lower bound, got %v", firstRange)
+ }
+ if got := firstRange["lte"]; got != float64(20) {
+ t.Fatalf("unexpected first upper bound: %v", got)
+ }
+
+ secondRange := getMustClause(t, partitions[1].Filter)["range"].(util.MapStr)["value"].(util.MapStr)
+ if got := secondRange["gt"]; got != float64(20) {
+ t.Fatalf("unexpected second lower bound: %v", got)
+ }
+ if _, ok := secondRange["lte"]; ok {
+ t.Fatalf("expected last partition to have no upper bound, got %v", secondRange)
+ }
+}
+
+func TestBuildQuantilePartitionsSinglePartitionUsesExistsFilter(t *testing.T) {
+ partitions := buildQuantilePartitions([]float64{5, 5}, "value", PartitionByNumber, nil)
+ if len(partitions) != 1 {
+ t.Fatalf("unexpected partition count: %d", len(partitions))
+ }
+
+ clause := getMustClause(t, partitions[0].Filter)
+ exists, ok := clause["exists"].(util.MapStr)
+ if !ok {
+ t.Fatalf("expected exists clause, got %v", clause)
+ }
+ if exists["field"] != "value" {
+ t.Fatalf("unexpected exists field: %v", exists["field"])
+ }
+}
+
+func TestBuildOpenPartitionFilterPreservesDateFormat(t *testing.T) {
+ upper := 1000.0
+ filter := buildOpenPartitionFilter(nil, &upper, "ts", PartitionByDate, nil)
+ rangeFilter := getMustClause(t, filter)["range"].(util.MapStr)["ts"].(util.MapStr)
+ if got := rangeFilter["format"]; got != "epoch_millis" {
+ t.Fatalf("unexpected date format: %v", got)
+ }
+ if got := rangeFilter["lte"]; got != int64(1000) {
+ t.Fatalf("unexpected upper bound: %v", got)
+ }
+}
+
+func TestBuildOpenPartitionFilterRoundsDatePercentileBoundaries(t *testing.T) {
+ lower := 1779109187904.8455
+ upper := 1779109187999.999
+ filter := buildOpenPartitionFilter(&lower, &upper, "created_at", PartitionByDate, nil)
+ rangeFilter := getMustClause(t, filter)["range"].(util.MapStr)["created_at"].(util.MapStr)
+ if got := rangeFilter["gt"]; got != int64(1779109187904) {
+ t.Fatalf("unexpected lower bound: %v", got)
+ }
+ if got := rangeFilter["lte"]; got != int64(1779109187999) {
+ t.Fatalf("unexpected upper bound: %v", got)
+ }
+}
+
+func TestBuildBoundedPartitionFilterRoundsDateBoundaries(t *testing.T) {
+ filter := buildBoundedPartitionFilter(1779109187904.1, 1779109187999.9, "created_at", PartitionByDate, nil)
+ rangeFilter := getMustClause(t, filter)["range"].(util.MapStr)["created_at"].(util.MapStr)
+ if got := rangeFilter["gte"]; got != int64(1779109187905) {
+ t.Fatalf("unexpected lower bound: %v", got)
+ }
+ if got := rangeFilter["lte"]; got != int64(1779109187999) {
+ t.Fatalf("unexpected upper bound: %v", got)
+ }
+}
+
+func TestBuildExactTermPartitionFilter(t *testing.T) {
+ filter := buildExactTermPartitionFilter("pmid-1", "pmid.keyword", nil)
+ termFilter := getMustClause(t, filter)["term"].(util.MapStr)["pmid.keyword"].(util.MapStr)
+ if got := termFilter["value"]; got != "pmid-1" {
+ t.Fatalf("unexpected term value: %v", got)
+ }
+}
+
+func TestBuildOtherTermsPartitionFilter(t *testing.T) {
+ filter := buildOtherTermsPartitionFilter([]string{"a", "b"}, "pmid.keyword", nil)
+ boolFilter := filter["bool"].(util.MapStr)
+ mustNot := boolFilter["must_not"].([]interface{})
+ termsFilter := mustNot[0].(util.MapStr)["terms"].(util.MapStr)
+ values := termsFilter["pmid.keyword"].([]string)
+ if !reflect.DeepEqual(values, []string{"a", "b"}) {
+ t.Fatalf("unexpected excluded values: %v", values)
+ }
+}
+
+func TestBuildHashPartitionFilter(t *testing.T) {
+ filter := buildHashPartitionFilter(1, 8, "pmid.keyword", nil)
+ scriptFilter := getMustClause(t, filter)["script"].(util.MapStr)["script"].(util.MapStr)
+ if scriptFilter["lang"] != "painless" {
+ t.Fatalf("unexpected script language: %v", scriptFilter["lang"])
+ }
+ source, ok := scriptFilter["source"].(string)
+ if !ok {
+ t.Fatalf("unexpected script source: %T", scriptFilter["source"])
+ }
+ if !strings.Contains(source, "doc['pmid.keyword']") {
+ t.Fatalf("unexpected script source: %s", source)
+ }
+ if !strings.Contains(source, "value != ''") {
+ t.Fatalf("expected empty strings to be excluded from hash partition, got %s", source)
+ }
+ if strings.Contains(source, "Math.floorMod") {
+ t.Fatalf("unexpected script source: %s", source)
+ }
+ params := scriptFilter["params"].(util.MapStr)
+ if params["partition_count"] != 8 || params["partition_id"] != 1 {
+ t.Fatalf("unexpected script params: %v", params)
+ }
+ if _, ok := params["field"]; ok {
+ t.Fatalf("field should not be passed as a script param: %v", params)
+ }
+}
+
+func TestBuildHashPartitionAggQueryAppliesOuterFilter(t *testing.T) {
+ query := buildHashPartitionAggQuery("pmid.keyword", 8, util.MapStr{
+ "term": util.MapStr{
+ "env": util.MapStr{"value": "prod"},
+ },
+ })
+
+ if !reflect.DeepEqual(query["query"], util.MapStr{
+ "term": util.MapStr{
+ "env": util.MapStr{"value": "prod"},
+ },
+ }) {
+ t.Fatalf("expected outer filter to be applied at top-level query, got %v", query["query"])
+ }
+
+ termsAgg := query["aggs"].(util.MapStr)["partitions"].(util.MapStr)["terms"].(util.MapStr)
+ if got := termsAgg["size"]; got != 8 {
+ t.Fatalf("unexpected partition size: %v", got)
+ }
+ if got := termsAgg["value_type"]; got != "long" {
+ t.Fatalf("unexpected value_type: %v", got)
+ }
+ script := termsAgg["script"].(util.MapStr)
+ source, ok := script["source"].(string)
+ if !ok {
+ t.Fatalf("unexpected script source type: %T", script["source"])
+ }
+ if !strings.Contains(source, "return null") {
+ t.Fatalf("expected missing values to be skipped in hash aggregation, got %s", source)
+ }
+ if !strings.Contains(source, "value == ''") {
+ t.Fatalf("expected empty strings to be excluded in hash aggregation, got %s", source)
+ }
+ params := script["params"].(util.MapStr)
+ if got := params["partition_count"]; got != 8 {
+ t.Fatalf("unexpected partition_count: %v", got)
+ }
+}
+
+func TestExtractHashPartitionDocCountsMapsByBucketKey(t *testing.T) {
+ counts := extractHashPartitionDocCounts(&SearchResponse{
+ Aggregations: map[string]AggregationResponse{
+ "partitions": {
+ Buckets: []BucketBase{
+ {"key": float64(5), "doc_count": float64(12)},
+ {"key": "1", "doc_count": float64(7)},
+ {"key": float64(99), "doc_count": float64(3)},
+ },
+ },
+ },
+ }, 8)
+
+ expected := []int64{0, 7, 0, 0, 0, 12, 0, 0}
+ if !reflect.DeepEqual(counts, expected) {
+ t.Fatalf("unexpected hash counts: got %v want %v", counts, expected)
+ }
+}
+
+func TestBuildMissingFieldConditionIncludesEmptyString(t *testing.T) {
+ filter := buildMissingFieldCondition("pmid.keyword")
+ boolFilter, ok := filter["bool"].(util.MapStr)
+ if !ok {
+ t.Fatalf("expected bool filter, got %v", filter)
+ }
+ if got := boolFilter["minimum_should_match"]; got != 1 {
+ t.Fatalf("unexpected minimum_should_match: %v", got)
+ }
+ should, ok := boolFilter["should"].([]interface{})
+ if !ok || len(should) != 2 {
+ t.Fatalf("expected two should clauses, got %v", boolFilter["should"])
+ }
+ termFilter := should[1].(util.MapStr)["term"].(util.MapStr)["pmid.keyword"].(util.MapStr)
+ if got := termFilter["value"]; got != "" {
+ t.Fatalf("unexpected empty-string term filter: %v", termFilter)
+ }
+}
+
+func TestBuildMissingFieldFilterPreservesOuterFilter(t *testing.T) {
+ filter := buildMissingFieldFilter("pmid.keyword", util.MapStr{
+ "term": util.MapStr{
+ "env": util.MapStr{"value": "prod"},
+ },
+ })
+ boolFilter, ok := filter["bool"].(util.MapStr)
+ if !ok {
+ t.Fatalf("expected bool filter, got %v", filter)
+ }
+ must, ok := boolFilter["must"].([]interface{})
+ if !ok || len(must) != 2 {
+ t.Fatalf("expected two must clauses, got %v", boolFilter["must"])
+ }
+ innerBool, ok := must[0].(util.MapStr)["bool"].(util.MapStr)
+ if !ok {
+ t.Fatalf("expected wrapped missing bool filter, got %v", must[0])
+ }
+ if got := innerBool["minimum_should_match"]; got != 1 {
+ t.Fatalf("unexpected minimum_should_match: %v", got)
+ }
+}
+
+func TestBuildPainlessStringLiteralEscapesSingleQuote(t *testing.T) {
+ got := buildPainlessStringLiteral("foo'bar")
+ if got != `'foo\'bar'` {
+ t.Fatalf("unexpected painless string literal: %s", got)
+ }
+}
+
+func TestEnsurePartitionSearchResponseOKReturnsBackendReason(t *testing.T) {
+ err := ensurePartitionSearchResponseOK(&SearchResponse{
+ ResponseBase: ResponseBase{
+ StatusCode: http.StatusInternalServerError,
+ RawResult: &util.Result{
+ Body: []byte(`{"error":{"reason":"runtime script failure"},"status":500}`),
+ },
+ InternalError: InternalError{
+ Error: &ErrorDetail{
+ Reason: "runtime script failure",
+ },
+ Status: http.StatusInternalServerError,
+ },
+ },
+ })
+ if err == nil || err.Error() != "runtime script failure" {
+ t.Fatalf("unexpected error: %v", err)
+ }
+}
+
+func TestEnsurePartitionSearchResponseOKReturnsCausedByReason(t *testing.T) {
+ err := ensurePartitionSearchResponseOK(&SearchResponse{
+ ResponseBase: ResponseBase{
+ StatusCode: http.StatusBadRequest,
+ RawResult: &util.Result{
+ Body: []byte(`{"error":{"root_cause":[{"reason":"compile error"}],"failed_shards":[{"reason":{"reason":"compile error","caused_by":{"reason":"static method [java.lang.Math, floorMod/2] not found"}}}],"reason":"all shards failed"},"status":400}`),
+ },
+ },
+ })
+ if err == nil || err.Error() != "static method [java.lang.Math, floorMod/2] not found" {
+ t.Fatalf("unexpected error: %v", err)
+ }
+}
+
+func getMustClause(t *testing.T, filter util.MapStr) util.MapStr {
+ t.Helper()
+ boolFilter, ok := filter["bool"].(util.MapStr)
+ if !ok {
+ t.Fatalf("expected bool filter, got %v", filter)
+ }
+ must, ok := boolFilter["must"].([]interface{})
+ if !ok || len(must) == 0 {
+ t.Fatalf("expected must clauses, got %v", boolFilter["must"])
+ }
+ clause, ok := must[0].(util.MapStr)
+ if !ok {
+ t.Fatalf("expected util.MapStr clause, got %T", must[0])
+ }
+ return clause
+}
diff --git a/core/env/env.go b/core/env/env.go
index 45b61ff8e..d8603bf0c 100755
--- a/core/env/env.go
+++ b/core/env/env.go
@@ -255,7 +255,11 @@ func (env *Env) InitPaths(cfgPath string) error {
if cfgObj, err = config.LoadFile(cfgPath); err != nil {
return fmt.Errorf("error loading confiuration file: %v, %w", cfgPath, err)
}
- return cfgObj.Unpack(&env.SystemConfig)
+ if err := cfgObj.Unpack(&env.SystemConfig); err != nil {
+ return err
+ }
+ env.normalizeRelativePaths()
+ return nil
} else {
if !env.IgnoreOnConfigMissing {
return errors.Errorf("config file %v not found", cfgPath)
@@ -418,6 +422,7 @@ func (env *Env) loadEnvFromConfigFile(filename string) error {
}
env.SystemConfig = &tempCfg
+ env.normalizeRelativePaths()
//initialize node config
env.findWorkingDir()
@@ -481,6 +486,32 @@ func (env *Env) loadEnvFromConfigFile(filename string) error {
return nil
}
+func resolvePathRelativeToExecutable(p string) string {
+ p = strings.TrimSpace(p)
+ if p == "" || filepath.IsAbs(p) {
+ return p
+ }
+
+ executablePath, err := os.Executable()
+ if err != nil {
+ return p
+ }
+ // Keep relative runtime paths anchored to the installed binary rather than the caller's cwd so
+ // restarts, service managers, and migration workers all read/write the same directories.
+ return filepath.Join(filepath.Dir(executablePath), p)
+}
+
+func (env *Env) normalizeRelativePaths() {
+ if env.SystemConfig == nil {
+ return
+ }
+
+ env.SystemConfig.PathConfig.Config = resolvePathRelativeToExecutable(env.SystemConfig.PathConfig.Config)
+ env.SystemConfig.PathConfig.Data = resolvePathRelativeToExecutable(env.SystemConfig.PathConfig.Data)
+ env.SystemConfig.PathConfig.Log = resolvePathRelativeToExecutable(env.SystemConfig.PathConfig.Log)
+ env.SystemConfig.PathConfig.Plugin = resolvePathRelativeToExecutable(env.SystemConfig.PathConfig.Plugin)
+}
+
func (env *Env) GetConfigFile() string {
return env.configFile
}
@@ -550,7 +581,7 @@ func ParseConfigSection(cfg *config.Config, configKey string, configInstance int
// go-ucfg raises an error if the key does not exist, in which case
// we should return and report that the configKey does not exist.
if ucfgErr, ok := err.(ucfg.Error); ok && ucfgErr.Reason() == ucfg.ErrMissing {
- log.Debugf("config key: %s not found", configKey)
+ log.Tracef("config key: %s not found", configKey)
return false, nil
}
diff --git a/core/env/env_test.go b/core/env/env_test.go
index 23e00d752..b87292741 100644
--- a/core/env/env_test.go
+++ b/core/env/env_test.go
@@ -24,6 +24,8 @@
package env
import (
+ "os"
+ "path/filepath"
"testing"
"github.com/stretchr/testify/assert"
@@ -103,6 +105,55 @@ func TestParseConfigSection_ExistingKey_UnpackFails(t *testing.T) {
require.Error(t, err)
}
+// TestResolvePathRelativeToExecutableUsesExecutableDir checks that a bare relative path
+// ("data") is joined onto the directory of the running test executable.
+func TestResolvePathRelativeToExecutableUsesExecutableDir(t *testing.T) {
+ executablePath, err := os.Executable()
+ require.NoError(t, err)
+
+ got := resolvePathRelativeToExecutable("data")
+
+ assert.Equal(t, filepath.Join(filepath.Dir(executablePath), "data"), got)
+}
+
+// TestNormalizeRelativePathsUsesExecutableDir sets all four PathConfig entries to relative
+// values and checks that normalizeRelativePaths rewrites each one to live under the
+// directory of the running test executable.
+func TestNormalizeRelativePathsUsesExecutableDir(t *testing.T) {
+ executablePath, err := os.Executable()
+ require.NoError(t, err)
+
+ env := EmptyEnv()
+ env.SystemConfig.PathConfig.Config = "config"
+ env.SystemConfig.PathConfig.Data = "data"
+ env.SystemConfig.PathConfig.Log = "log"
+ env.SystemConfig.PathConfig.Plugin = "plugin"
+
+ env.normalizeRelativePaths()
+
+ executableDir := filepath.Dir(executablePath)
+ assert.Equal(t, filepath.Join(executableDir, "config"), env.SystemConfig.PathConfig.Config)
+ assert.Equal(t, filepath.Join(executableDir, "data"), env.SystemConfig.PathConfig.Data)
+ assert.Equal(t, filepath.Join(executableDir, "log"), env.SystemConfig.PathConfig.Log)
+ assert.Equal(t, filepath.Join(executableDir, "plugin"), env.SystemConfig.PathConfig.Plugin)
+}
+
+// TestInitPathsNormalizesRelativePathsFromConfig writes a config file with relative
+// path.data, path.log, and path.configs entries and checks that InitPaths resolves each of
+// them against the directory of the running test executable.
+func TestInitPathsNormalizesRelativePathsFromConfig(t *testing.T) {
+	executablePath, err := os.Executable()
+	require.NoError(t, err)
+
+	// t.TempDir is removed automatically by the testing package, and os.WriteFile closes the
+	// file even when the write fails, so an early abort leaves no temp file or open handle.
+	cfgPath := filepath.Join(t.TempDir(), "env-paths.yml")
+	cfgBody := []byte("path.data: data\npath.log: log\npath.configs: config\n")
+	require.NoError(t, os.WriteFile(cfgPath, cfgBody, 0o600))
+
+	env := EmptyEnv()
+	require.NoError(t, env.InitPaths(cfgPath))
+
+	executableDir := filepath.Dir(executablePath)
+	assert.Equal(t, filepath.Join(executableDir, "data"), env.SystemConfig.PathConfig.Data)
+	assert.Equal(t, filepath.Join(executableDir, "log"), env.SystemConfig.PathConfig.Log)
+	assert.Equal(t, filepath.Join(executableDir, "config"), env.SystemConfig.PathConfig.Config)
+}
+
func TestParseConfigSection_KeyExistsButPrimitive_ReturnsError(t *testing.T) {
// Key exists but value is primitive (string), not an object. Child returns type error.
cfg, err := config.NewConfigFrom(map[string]interface{}{
diff --git a/core/orm/registry.go b/core/orm/registry.go
index 292b42e36..ab3b4513b 100644
--- a/core/orm/registry.go
+++ b/core/orm/registry.go
@@ -10,6 +10,11 @@ import (
var registeredSchemas = []util.KeyValue{}
+// schemaRegistrationKey builds the identity string used to compare schema registrations:
+// the two names returned by util.GetTypeAndPackageName(t, true), joined with "-".
+func schemaRegistrationKey(t interface{}) string {
+	packageName, schemaName := util.GetTypeAndPackageName(t, true)
+	key := packageName + "-" + schemaName
+	return key
+}
+
func MustRegisterSchemaWithIndexName(t interface{}, index string) {
err := RegisterSchemaWithIndexName(t, index)
if err != nil {
@@ -18,6 +23,17 @@ func MustRegisterSchemaWithIndexName(t interface{}, index string) {
}
+// RegisterSchemaWithIndexName records t as the schema for index.
+//
+// If index is already registered, the existing and new schemas are compared by
+// schemaRegistrationKey: a match makes the call a no-op returning nil, a mismatch returns
+// an error naming the schema that already owns the index.
func RegisterSchemaWithIndexName(t interface{}, index string) error {
+ newKey := schemaRegistrationKey(t)
+ for _, registered := range registeredSchemas {
+ if registered.Key != index {
+ continue
+ }
+ existingKey := schemaRegistrationKey(registered.Payload)
+ if existingKey == newKey {
+ return nil
+ }
+ return errors.Errorf("schema index [%s] already registered by [%s]", index, existingKey)
+ }
registeredSchemas = append(registeredSchemas, util.KeyValue{Key: index, Payload: t})
return nil
}
@@ -35,6 +51,10 @@ func InitSchema() error {
var handler ORM
+// HasHandler reports whether the package-level ORM handler is non-nil, letting callers
+// check for a handler without going through getHandler, which panics when none is set.
+func HasHandler() bool {
+ return handler != nil
+}
+
func getHandler() ORM {
if handler == nil {
panic(errors.New("ORM handler is not registered"))
diff --git a/core/orm/registry_test.go b/core/orm/registry_test.go
new file mode 100644
index 000000000..d8327959a
--- /dev/null
+++ b/core/orm/registry_test.go
@@ -0,0 +1,62 @@
+// Copyright (C) INFINI Labs & INFINI LIMITED.
+//
+// The INFINI Framework is offered under the GNU Affero General Public License v3.0
+// and as commercial software.
+//
+// For commercial licensing, contact us at:
+// - Website: infinilabs.com
+// - Email: hello@infini.ltd
+//
+// Open Source licensed under AGPL V3:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package orm
+
+import "testing"
+
+// testSchemaAlpha and testSchemaBeta are two distinct empty schema types used to exercise
+// same-schema and conflicting-schema registration against one index name.
+type testSchemaAlpha struct{}
+type testSchemaBeta struct{}
+
+// TestRegisterSchemaWithIndexNameDeduplicatesSameSchema registers testSchemaAlpha twice for
+// the same index (once by value, once by pointer) and expects a single registry entry.
+// The package-level registry is swapped out for the test and restored on cleanup.
+func TestRegisterSchemaWithIndexNameDeduplicatesSameSchema(t *testing.T) {
+ original := registeredSchemas
+ registeredSchemas = nil
+ t.Cleanup(func() {
+ registeredSchemas = original
+ })
+
+ if err := RegisterSchemaWithIndexName(testSchemaAlpha{}, "test-index"); err != nil {
+ t.Fatalf("expected first registration to succeed, got %v", err)
+ }
+ if err := RegisterSchemaWithIndexName(&testSchemaAlpha{}, "test-index"); err != nil {
+ t.Fatalf("expected duplicate registration to be ignored, got %v", err)
+ }
+ if len(registeredSchemas) != 1 {
+ t.Fatalf("expected exactly one registered schema, got %d", len(registeredSchemas))
+ }
+}
+
+// TestRegisterSchemaWithIndexNameRejectsDifferentSchemaForSameIndex registers
+// testSchemaAlpha for an index and expects a second registration of testSchemaBeta under
+// the same index to return an error. The package-level registry is restored on cleanup.
+func TestRegisterSchemaWithIndexNameRejectsDifferentSchemaForSameIndex(t *testing.T) {
+ original := registeredSchemas
+ registeredSchemas = nil
+ t.Cleanup(func() {
+ registeredSchemas = original
+ })
+
+ if err := RegisterSchemaWithIndexName(testSchemaAlpha{}, "test-index"); err != nil {
+ t.Fatalf("expected first registration to succeed, got %v", err)
+ }
+ if err := RegisterSchemaWithIndexName(testSchemaBeta{}, "test-index"); err == nil {
+ t.Fatal("expected conflicting registration to fail")
+ }
+}
diff --git a/core/pipeline/context.go b/core/pipeline/context.go
index 97fe18af8..efcf16ff9 100755
--- a/core/pipeline/context.go
+++ b/core/pipeline/context.go
@@ -93,21 +93,12 @@ type Context struct {
id string
steps int64
- // cancelFunc closes the Done channel of the embedded context.Context,
- // signaling processors to stop early.
- //
- // This is a cooperative mechanism: it only takes effect if the processor's process()
- // implementation explicitly checks IsCanceled() and returns when it is true.
- cancelFunc context.CancelFunc
- // True means the goroutine has been paused/suspended.
- isPaused bool
- pause sync.WaitGroup
- // Set this to true if you want to stop the pipeline, and then, pause (suspend) the goroutine.
- isQuit bool
- stateLock sync.Mutex
- // Set this to true if you want to let the goroutine exit, i.e., the kill signal.
- released bool
- // True means the goroutine already exited.
+ cancelFunc context.CancelFunc
+ isPaused bool
+ pause sync.WaitGroup
+ isQuit bool
+ stateLock sync.Mutex
+ released bool
loopReleased bool
}
@@ -116,7 +107,6 @@ func AcquireContext(config PipelineConfigV2) *Context {
ctx.ResetContext()
ctx.id = util.GetUUID()
ctx.createTime = time.Now()
- // Placeholder state; the pipeline task execution loop will overwrite this.
ctx.runningState = FINISHED
ctx.Config = config
return &ctx
@@ -299,7 +289,21 @@ func (ctx *Context) Errors() []error {
return ctx.processErrs
}
-// Pause suspends the goroutine that is running this pipeline.
+// GetResultState returns the result state computed by getResultStateLocked while holding
+// ctx.stateLock for the duration of the call.
+func (ctx *Context) GetResultState() RunningState {
+ ctx.stateLock.Lock()
+ defer ctx.stateLock.Unlock()
+
+ return ctx.getResultStateLocked()
+}
+
+// GetResultError returns the exit error and process errors formatted by
+// formatPipelineResultError, read while holding ctx.stateLock. The result is the empty
+// string when neither an exit error nor any process error is recorded.
+func (ctx *Context) GetResultError() string {
+ ctx.stateLock.Lock()
+ defer ctx.stateLock.Unlock()
+
+ return formatPipelineResultError(ctx.exitErr, ctx.processErrs)
+}
+
+// Pause will pause the pipeline running loop until Resume called
func (ctx *Context) Pause() {
ctx.stateLock.Lock()
if ctx.isPaused {
@@ -313,7 +317,7 @@ func (ctx *Context) Pause() {
ctx.pause.Wait()
}
-// Resume wakes up the goroutine that was suspended by Pause.
+// Resume recovers pipeline from Pause
func (ctx *Context) Resume() {
ctx.stateLock.Lock()
if !ctx.isPaused {
@@ -378,6 +382,33 @@ func (ctx *Context) setRunningState(newState RunningState) {
}
}
+// getResultStateLocked maps the current running state to the result of the last run.
+// The Locked suffix signals that the caller is expected to hold ctx.stateLock, as
+// GetResultState does.
+//
+// FINISHED and FAILED are reported as-is. STOPPED is reported as-is while no end time is
+// recorded; once endTime is set it becomes FAILED if an exit error or any process error
+// was recorded, and FINISHED otherwise. Every other state yields the empty RunningState.
+func (ctx *Context) getResultStateLocked() RunningState {
+	current := ctx.runningState
+	if current == FINISHED || current == FAILED {
+		return current
+	}
+	if current != STOPPED {
+		return ""
+	}
+
+	// STOPPED is also used during normal shutdown. An end time marks a completed run, so the
+	// recorded errors decide whether the API sees it as finished or failed rather than as a
+	// manual stop.
+	if ctx.endTime == nil {
+		return STOPPED
+	}
+	failed := ctx.exitErr != nil || len(ctx.processErrs) > 0
+	if failed {
+		return FAILED
+	}
+	return FINISHED
+}
+
+// formatPipelineResultError renders a pipeline's exit error and per-step process errors as
+// a single "exit: ..., process: ..." message. It returns the empty string when exitErr is
+// nil and processErrs is empty.
+func formatPipelineResultError(exitErr error, processErrs []error) string {
+	if exitErr != nil || len(processErrs) > 0 {
+		return fmt.Sprintf("exit: %v, process: %v", exitErr, processErrs)
+	}
+	return ""
+}
+
func (ctx *Context) pushPipelineLog() {
if global.Env().IsDebug {
log.Info("received pipeline state change, id: ", ctx.Config.Name, ", state: ", ctx.runningState)
@@ -407,8 +438,8 @@ func (ctx *Context) pushPipelineLog() {
result := util.MapStr{
"success": ctx.exitErr == nil,
}
- if ctx.exitErr != nil || len(ctx.processErrs) > 0 {
- result["error"] = fmt.Sprintf("exit: %v, process: %v", ctx.exitErr, ctx.processErrs)
+ if errMsg := formatPipelineResultError(ctx.exitErr, ctx.processErrs); errMsg != "" {
+ result["error"] = errMsg
}
payload["result"] = result
}
diff --git a/core/pipeline/context_result_test.go b/core/pipeline/context_result_test.go
new file mode 100644
index 000000000..626259d6c
--- /dev/null
+++ b/core/pipeline/context_result_test.go
@@ -0,0 +1,82 @@
+// Copyright (C) INFINI Labs & INFINI LIMITED.
+//
+// The INFINI Framework is offered under the GNU Affero General Public License v3.0
+// and as commercial software.
+//
+// For commercial licensing, contact us at:
+// - Website: infinilabs.com
+// - Email: hello@infini.ltd
+//
+// Open Source licensed under AGPL V3:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package pipeline
+
+import (
+ "errors"
+ "testing"
+)
+
+// TestGetResultStateReturnsFinishedAfterStoppedCompletedRun drives a context through
+// Started, Finished, Stopped and expects the result state FINISHED with no result error.
+func TestGetResultStateReturnsFinishedAfterStoppedCompletedRun(t *testing.T) {
+ ctx := AcquireContext(PipelineConfigV2{})
+ ctx.Started()
+ ctx.Finished()
+ ctx.Stopped()
+
+ if got := ctx.GetResultState(); got != FINISHED {
+ t.Fatalf("expected FINISHED result state, got %q", got)
+ }
+ if got := ctx.GetResultError(); got != "" {
+ t.Fatalf("expected empty result error, got %q", got)
+ }
+}
+
+// TestGetResultStateReturnsFailedAfterStoppedFailedRun drives a context through Started,
+// Failed(err), Stopped and expects the result state FAILED with a non-empty result error.
+func TestGetResultStateReturnsFailedAfterStoppedFailedRun(t *testing.T) {
+ ctx := AcquireContext(PipelineConfigV2{})
+ ctx.Started()
+ ctx.Failed(errors.New("boom"))
+ ctx.Stopped()
+
+ if got := ctx.GetResultState(); got != FAILED {
+ t.Fatalf("expected FAILED result state, got %q", got)
+ }
+ if got := ctx.GetResultError(); got == "" {
+ t.Fatal("expected result error for failed run")
+ }
+}
+
+// TestGetResultStateReturnsStoppedForManualStop drives a context through Started,
+// Stopping, Stopped (no Finished/Failed call) and expects the result state to stay STOPPED
+// with no result error.
+func TestGetResultStateReturnsStoppedForManualStop(t *testing.T) {
+ ctx := AcquireContext(PipelineConfigV2{})
+ ctx.Started()
+ ctx.Stopping()
+ ctx.Stopped()
+
+ if got := ctx.GetResultState(); got != STOPPED {
+ t.Fatalf("expected STOPPED result state, got %q", got)
+ }
+ if got := ctx.GetResultError(); got != "" {
+ t.Fatalf("expected empty result error, got %q", got)
+ }
+}
+
+// TestGetResultErrorIncludesProcessErrors records an error via RecordError on a run that
+// then finishes and expects GetResultError to return a non-empty message.
+func TestGetResultErrorIncludesProcessErrors(t *testing.T) {
+ ctx := AcquireContext(PipelineConfigV2{})
+ ctx.Started()
+ ctx.RecordError(errors.New("slice failed"))
+ ctx.Finished()
+
+ if got := ctx.GetResultError(); got == "" {
+ t.Fatal("expected process error to be surfaced")
+ }
+}
diff --git a/core/queue/consumer_config.go b/core/queue/consumer_config.go
index 8572bcf47..2f0b3f328 100644
--- a/core/queue/consumer_config.go
+++ b/core/queue/consumer_config.go
@@ -172,7 +172,7 @@ func RemoveAllConsumers(qConfig *QueueConfig) (bool, error) {
log.Error(err)
return false, err
}
- log.Debugf("success delete all consumers for queue:%v", qConfig.ID)
+ log.Tracef("success delete all consumers for queue:%v", qConfig.ID)
return true, nil
}
diff --git a/core/queue/queue_config.go b/core/queue/queue_config.go
index 8d2671869..d03b2f64c 100644
--- a/core/queue/queue_config.go
+++ b/core/queue/queue_config.go
@@ -118,7 +118,7 @@ func RegisterConfig(cfg *QueueConfig) (preExists bool, err error) {
cfg.Created = time.Now().String()
- log.Debug("init new queue config:", cfg.ID, ",", cfg.Name)
+ log.Trace("init new queue config:", cfg.ID, ",", cfg.Name)
addCfgToCache(cfg)
diff --git a/core/task/task.go b/core/task/task.go
index 8ae5d6d0e..dc2df50e3 100644
--- a/core/task/task.go
+++ b/core/task/task.go
@@ -28,9 +28,11 @@ import (
log "github.com/cihub/seelog"
"infini.sh/framework/core/errors"
"infini.sh/framework/core/global"
+ "infini.sh/framework/core/orm"
"infini.sh/framework/core/task/chrono"
"infini.sh/framework/core/util"
"runtime"
+ "strings"
"sync"
"sync/atomic"
"time"
@@ -38,6 +40,24 @@ import (
var Tasks = sync.Map{}
+// shouldSilenceStartupTaskError reports whether msg is the "ORM handler is not registered"
+// error raised while no ORM handler is registered yet.
+//
+// During startup some tasks can run slightly ahead of ORM registration; that one message
+// is treated as bootstrap noise so real migration/task failures stay visible in error logs.
+func shouldSilenceStartupTaskError(msg string) bool {
+	if orm.HasHandler() {
+		// Once a handler exists the same message points at a genuine problem.
+		return false
+	}
+	return strings.Contains(msg, "ORM handler is not registered")
+}
+
+// logTaskRuntimeIssue logs a task failure described by msg.
+//
+// Messages accepted by shouldSilenceStartupTaskError are logged at debug level. Anything
+// else is logged at error level, with raw (for example the recovered panic value or the
+// original error) placed before msg when raw is non-nil.
+func logTaskRuntimeIssue(msg string, raw interface{}) {
+	switch {
+	case shouldSilenceStartupTaskError(msg):
+		log.Debug(msg)
+	case raw != nil:
+		log.Error(raw, msg)
+	default:
+		log.Error(msg)
+	}
+}
+
type State string
const (
@@ -103,7 +123,7 @@ func RegisterTransientTask(group, tag string, f func(ctx context.Context) error,
case string:
v = r.(string)
}
- log.Error(r, v)
+ logTaskRuntimeIssue(v, r)
}
}
task.State = Finished
@@ -118,7 +138,7 @@ func RegisterTransientTask(group, tag string, f func(ctx context.Context) error,
task.State = Running
err := inner(innerCtx)
if err != nil {
- log.Error(err)
+ logTaskRuntimeIssue(err.Error(), err)
}
t = time.Now()
task.EndTime = &t
@@ -194,7 +214,7 @@ func RegisterScheduleTask(task ScheduleTask) (taskID string) {
case string:
v = r.(string)
}
- log.Error(v)
+ logTaskRuntimeIssue(v, nil)
}
}
task.isTaskRunning.Store(false)
diff --git a/core/task/task_test.go b/core/task/task_test.go
new file mode 100644
index 000000000..6a18dc66d
--- /dev/null
+++ b/core/task/task_test.go
@@ -0,0 +1,38 @@
+// Copyright (C) INFINI Labs & INFINI LIMITED.
+//
+// The INFINI Framework is offered under the GNU Affero General Public License v3.0
+// and as commercial software.
+//
+// For commercial licensing, contact us at:
+// - Website: infinilabs.com
+// - Email: hello@infini.ltd
+//
+// Open Source licensed under AGPL V3:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package task
+
+import "testing"
+
+// TestShouldSilenceStartupTaskErrorForMissingORMHandler expects the missing-ORM-handler
+// message to be silenced. shouldSilenceStartupTaskError also requires orm.HasHandler() to
+// be false, so this test relies on no ORM handler being registered in the test process.
+func TestShouldSilenceStartupTaskErrorForMissingORMHandler(t *testing.T) {
+ if !shouldSilenceStartupTaskError("ORM handler is not registered") {
+ t.Fatal("expected missing ORM handler startup error to be silenced")
+ }
+}
+
+// TestShouldSilenceStartupTaskErrorIgnoresOtherErrors expects a message that does not
+// contain the missing-ORM-handler text to stay visible (not silenced).
+func TestShouldSilenceStartupTaskErrorIgnoresOtherErrors(t *testing.T) {
+ if shouldSilenceStartupTaskError("queue consumer is not registered") {
+ t.Fatal("expected unrelated startup errors to remain visible")
+ }
+}
diff --git a/docs/content.en/docs/release-notes/_index.md b/docs/content.en/docs/release-notes/_index.md
index fefcf930f..9e9e779c4 100644
--- a/docs/content.en/docs/release-notes/_index.md
+++ b/docs/content.en/docs/release-notes/_index.md
@@ -30,6 +30,7 @@ Information about release notes of INFINI Framework is provided here.
### 🐛 Bug fix
### ✈️ Improvements
+- feat(data migration): improve partitioning, bulk queue recovery, pipeline result visibility, and migration path handling #368
- chore: API Handler Registration Improvements #283
- refactor: use PathUnescape to decode query param filter #249
- chore: move entity provider to non-managed mode #250
diff --git a/modules/elastic/adapter/elasticsearch/v0.go b/modules/elastic/adapter/elasticsearch/v0.go
index 0a13b8bd8..075731ec5 100755
--- a/modules/elastic/adapter/elasticsearch/v0.go
+++ b/modules/elastic/adapter/elasticsearch/v0.go
@@ -491,6 +491,20 @@ func (c *ESAPIV0) Get(indexName, docType, id string) (*elastic.GetResponse, erro
return esResp, err
}
+ if resp.StatusCode >= 400 {
+ if esResp.Error != nil {
+ errType := esResp.Error.Type
+ errReason := esResp.Error.Message()
+ if errType != "" && errReason != "" {
+ return esResp, errors.Errorf("status:%d, type:%s, reason:%s", resp.StatusCode, errType, errReason)
+ }
+ if errReason != "" {
+ return esResp, errors.Errorf("status:%d, reason:%s", resp.StatusCode, errReason)
+ }
+ }
+ return esResp, errors.Errorf("status:%d", resp.StatusCode)
+ }
+
return esResp, nil
}
@@ -1263,7 +1277,7 @@ func (s *ESAPIV0) UpdateMapping(indexName string, docType string, mappings []byt
panic(err)
}
if resp.StatusCode != http.StatusOK {
- return nil, fmt.Errorf("%s", resp.Body)
+ return nil, fmt.Errorf("%s", string(resp.Body))
}
return resp.Body, nil
@@ -1416,7 +1430,7 @@ func (c *ESAPIV0) GetTemplate(templateName string) (map[string]interface{}, erro
}
if resp.StatusCode != 200 {
- return nil, fmt.Errorf("%s", resp.Body)
+ return nil, fmt.Errorf("%s", string(resp.Body))
}
data := map[string]interface{}{}
@@ -1725,7 +1739,7 @@ func (c *ESAPIV0) Alias(body []byte) error {
return err
}
if res.StatusCode != http.StatusOK {
- return fmt.Errorf("%s", res.Body)
+ return fmt.Errorf("%s", string(res.Body))
}
return nil
}
@@ -1880,7 +1894,7 @@ func (c *ESAPIV0) UpdateClusterSettings(body []byte) error {
}
if resp.StatusCode != http.StatusOK {
- return fmt.Errorf("%s", resp.Body)
+ return fmt.Errorf("%s", string(resp.Body))
}
return nil
@@ -1893,7 +1907,7 @@ func (c *ESAPIV0) GetRemoteInfo() ([]byte, error) {
return nil, err
}
if resp.StatusCode != http.StatusOK {
- return nil, fmt.Errorf("%s", resp.Body)
+ return nil, fmt.Errorf("%s", string(resp.Body))
}
return resp.Body, nil
@@ -2003,7 +2017,7 @@ func (c *ESAPIV0) Flush(indexName string) ([]byte, error) {
return nil, err
}
if resp.StatusCode != http.StatusOK {
- return nil, fmt.Errorf("%s", resp.Body)
+ return nil, fmt.Errorf("%s", string(resp.Body))
}
return resp.Body, nil
}
@@ -2034,7 +2048,7 @@ func (c *ESAPIV0) ClusterAllocationExplain(ctx context.Context, body []byte, par
return nil, err
}
if resp.StatusCode != 200 {
- return nil, fmt.Errorf("%s", resp.Body)
+ return nil, fmt.Errorf("%s", string(resp.Body))
}
return resp.Body, nil
}
@@ -2046,7 +2060,7 @@ func (c *ESAPIV0) CatAllocation(ctx context.Context) ([]elastic.CatAllocationRes
return nil, err
}
if resp.StatusCode != 200 {
- return nil, fmt.Errorf("%s", resp.Body)
+ return nil, fmt.Errorf("%s", string(resp.Body))
}
data := []elastic.CatAllocationResponse{}
err = json.Unmarshal(resp.Body, &data)
diff --git a/modules/elastic/adapter/ver.go b/modules/elastic/adapter/ver.go
index 1e3120cb5..a02119217 100755
--- a/modules/elastic/adapter/ver.go
+++ b/modules/elastic/adapter/ver.go
@@ -169,6 +169,14 @@ func RequestTimeout(ctx *elastic.APIContext, method, url string, body []byte, me
func GetClusterUUID(clusterID string) (string, error) {
meta := elastic.GetMetadata(clusterID)
+ if meta == nil {
+ if cfg := elastic.GetConfigNoPanic(clusterID); cfg != nil {
+ if cfg.ClusterUUID != "" {
+ return cfg.ClusterUUID, nil
+ }
+ meta = elastic.GetOrInitMetadata(cfg)
+ }
+ }
if meta == nil {
return "", fmt.Errorf("metadata can not be mepty")
}
diff --git a/modules/elastic/adapter/ver_test.go b/modules/elastic/adapter/ver_test.go
new file mode 100644
index 000000000..4775efdae
--- /dev/null
+++ b/modules/elastic/adapter/ver_test.go
@@ -0,0 +1,53 @@
+// Copyright (C) INFINI Labs & INFINI LIMITED.
+//
+// The INFINI Framework is offered under the GNU Affero General Public License v3.0
+// and as commercial software.
+//
+// For commercial licensing, contact us at:
+// - Website: infinilabs.com
+// - Email: hello@infini.ltd
+//
+// Open Source licensed under AGPL V3:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package adapter
+
+import (
+ "testing"
+
+ "infini.sh/framework/core/elastic"
+ "infini.sh/framework/core/orm"
+)
+
+// TestGetClusterUUIDFallsBackToConfigWhenMetadataMissing registers only a config (with a
+// ClusterUUID set) via elastic.UpdateConfig and expects GetClusterUUID to return that
+// UUID. The registered instance is removed again on cleanup.
+func TestGetClusterUUIDFallsBackToConfigWhenMetadataMissing(t *testing.T) {
+ cfg := elastic.ElasticsearchConfig{
+ ORMObjectBase: orm.ORMObjectBase{ID: "test-cluster-uuid-fallback"},
+ Name: "test-cluster-uuid-fallback",
+ ClusterUUID: "cluster-uuid-fallback",
+ }
+
+ t.Cleanup(func() {
+ elastic.RemoveInstance(cfg.ID)
+ })
+
+ // NOTE(review): presumably UpdateConfig stores the config without creating metadata,
+ // which is what makes GetClusterUUID take the config fallback; confirm in core/elastic.
+ elastic.UpdateConfig(cfg)
+
+ clusterUUID, err := GetClusterUUID(cfg.ID)
+ if err != nil {
+ t.Fatalf("expected cluster uuid from config fallback, got error: %v", err)
+ }
+ if clusterUUID != cfg.ClusterUUID {
+ t.Fatalf("expected cluster uuid %q, got %q", cfg.ClusterUUID, clusterUUID)
+ }
+}
diff --git a/modules/elastic/common/config.go b/modules/elastic/common/config.go
index eeadb3dd8..4c27a29d4 100644
--- a/modules/elastic/common/config.go
+++ b/modules/elastic/common/config.go
@@ -87,7 +87,9 @@ func InitClientWithConfig(esConfig elastic.ElasticsearchConfig) (client elastic.
ver string
)
if esConfig.Version == "" || esConfig.Version == "auto" {
- verInfo, err := adapter.ClusterVersion(elastic.GetOrInitMetadata(&esConfig))
+ probeMeta := &elastic.ElasticsearchMetadata{Config: &esConfig}
+ probeMeta.Init(true)
+ verInfo, err := adapter.ClusterVersion(probeMeta)
if err != nil {
return nil, err
}
@@ -219,6 +221,9 @@ func InitElasticInstance(esConfig elastic.ElasticsearchConfig) (elastic.API, err
log.Warn("elasticsearch ", esConfig.Name, " is not enabled")
return nil, nil
}
+ originMeta := elastic.GetMetadata(esConfig.ID)
+ initHealth := getInitialMetadataHealth(originMeta)
+
client, err := InitClientWithConfig(esConfig)
if err != nil {
log.Error("elasticsearch ", esConfig.Name, err)
@@ -226,12 +231,6 @@ func InitElasticInstance(esConfig elastic.ElasticsearchConfig) (elastic.API, err
}
elastic.RegisterInstance(esConfig, client)
- originMeta := elastic.GetMetadata(esConfig.ID)
- initHealth := true
- if originMeta != nil {
- initHealth = originMeta.IsAvailable()
- }
-
v := elastic.InitMetadata(&esConfig, initHealth)
if v.Health == nil && originMeta != nil {
v.Health = originMeta.Health
@@ -240,6 +239,13 @@ func InitElasticInstance(esConfig elastic.ElasticsearchConfig) (elastic.API, err
return client, err
}
+// getInitialMetadataHealth picks the availability flag used when (re)initializing cluster
+// metadata: true when there is no previous metadata, otherwise whatever the previous
+// metadata reports through IsAvailable.
+func getInitialMetadataHealth(originMeta *elastic.ElasticsearchMetadata) bool {
+	return originMeta == nil || originMeta.IsAvailable()
+}
+
func GetBasicAuth(esConfig *elastic.ElasticsearchConfig) (basicAuth *model.BasicAuth, err error) {
if esConfig.BasicAuth != nil && esConfig.BasicAuth.Username != "" {
basicAuth = esConfig.BasicAuth
diff --git a/modules/elastic/common/config_test.go b/modules/elastic/common/config_test.go
new file mode 100644
index 000000000..78ff78347
--- /dev/null
+++ b/modules/elastic/common/config_test.go
@@ -0,0 +1,50 @@
+// Copyright (C) INFINI Labs & INFINI LIMITED.
+//
+// The INFINI Framework is offered under the GNU Affero General Public License v3.0
+// and as commercial software.
+//
+// For commercial licensing, contact us at:
+// - Website: infinilabs.com
+// - Email: hello@infini.ltd
+//
+// Open Source licensed under AGPL V3:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package common
+
+import (
+ "testing"
+
+ "infini.sh/framework/core/elastic"
+)
+
+// TestGetInitialMetadataHealthDefaultsToAvailableForNewCluster expects a nil previous
+// metadata (a cluster seen for the first time) to yield an initial health of true.
+func TestGetInitialMetadataHealthDefaultsToAvailableForNewCluster(t *testing.T) {
+ if !getInitialMetadataHealth(nil) {
+ t.Fatal("expected new cluster metadata to start as available before first health check")
+ }
+}
+
+// TestGetInitialMetadataHealthKeepsExistingAvailability expects getInitialMetadataHealth
+// to return false for metadata initialized with Init(false) and true after Init(true).
+func TestGetInitialMetadataHealthKeepsExistingAvailability(t *testing.T) {
+ meta := &elastic.ElasticsearchMetadata{Config: &elastic.ElasticsearchConfig{Enabled: true}}
+ meta.Init(false)
+
+ if getInitialMetadataHealth(meta) {
+ t.Fatal("expected existing unavailable metadata to remain unavailable")
+ }
+
+ meta.Init(true)
+ if !getInitialMetadataHealth(meta) {
+ t.Fatal("expected existing available metadata to remain available")
+ }
+}
diff --git a/modules/elastic/metadata.go b/modules/elastic/metadata.go
index 60c07bdd9..1c6cff6ce 100644
--- a/modules/elastic/metadata.go
+++ b/modules/elastic/metadata.go
@@ -697,10 +697,13 @@ func (module *ElasticModule) updateNodeInfo(meta *elastic.ElasticsearchMetadata,
log.Trace("update node info")
if !force && !meta.IsAvailable() {
+ stateChanged := false
if !force {
- setNodeUnknown(meta.Config.ID)
+ stateChanged = setNodeUnknown(meta.Config.ID)
+ }
+ if stateChanged || rate.GetRateLimiter("metadata_node_info_skip", meta.Config.ID, 1, 1, 10*time.Minute).Allow() {
+ log.Debugf("elasticsearch [%v] is not available, skip update node info", meta.Config.Name)
}
- log.Debugf("elasticsearch [%v] is not available, skip update node info", meta.Config.Name)
return
}
@@ -808,17 +811,17 @@ func (module *ElasticModule) updateNodeInfo(meta *elastic.ElasticsearchMetadata,
var saveNodeMetadataMutex = sync.Mutex{}
var nodeAlreadyUnknown = map[string]bool{}
-func setNodeUnknown(clusterID string) {
+func setNodeUnknown(clusterID string) bool {
kv.DeleteKey(elastic.KVElasticNodeMetadata, []byte(clusterID))
meta := elastic.GetMetadata(clusterID)
if meta == nil {
- return
+ return false
}
if meta.Config.Source != elastic.ElasticsearchConfigSourceElasticsearch {
- return
+ return false
}
if v, ok := nodeAlreadyUnknown[clusterID]; ok && v {
- return
+ return false
}
queueConfig := queue.GetOrInitConfig(elastic.QueueElasticIndexState)
if queueConfig.Labels == nil {
@@ -846,6 +849,7 @@ func setNodeUnknown(clusterID string) {
}
nodeAlreadyUnknown[clusterID] = true
+ return true
}
func saveNodeMetadata(nodes map[string]elastic.NodesInfo, clusterID string) error {
esConfig := elastic.GetConfig(clusterID)
diff --git a/modules/elastic/module.go b/modules/elastic/module.go
index 52264b0cc..702763f16 100755
--- a/modules/elastic/module.go
+++ b/modules/elastic/module.go
@@ -112,10 +112,26 @@ func loadFileBasedElasticConfig() []elastic.ElasticsearchConfig {
return configs
}
+// lookupSystemElasticsearchID fetches the value stored under
+// elastic.GlobalSystemElasticsearchID in the global registry. It returns the ID and true
+// only when that value is a non-empty string; otherwise it returns "" and false.
+func lookupSystemElasticsearchID() (string, bool) {
+	if id, isString := global.Lookup(elastic.GlobalSystemElasticsearchID).(string); isString && id != "" {
+		return id, true
+	}
+	return "", false
+}
+
func loadESBasedElasticConfig() []elastic.ElasticsearchConfig {
configs := []elastic.ElasticsearchConfig{}
+ systemID, ok := lookupSystemElasticsearchID()
+ if !ok {
+ // Console-managed elasticsearch configs live in the system cluster. During startup or
+ // migration bootstrap that cluster may not exist yet, so treat it as "no remote configs"
+ // instead of failing module initialization.
+ return configs
+ }
query := elastic.SearchRequest{From: 0, Size: 1000} //TODO handle clusters beyond 1000
- esClient := elastic.GetClient(global.MustLookupString(elastic.GlobalSystemElasticsearchID))
+ esClient := elastic.GetClient(systemID)
result, err := esClient.Search(orm.GetIndexName(elastic.ElasticsearchConfig{}), &query)
if err != nil {
log.Error(err)
@@ -394,20 +410,31 @@ func InitSchema() {
var ormInited bool
func (module *ElasticModule) Start() error {
+ systemID, hasSystemCluster := lookupSystemElasticsearchID()
if moduleConfig.ORMConfig.Enabled {
- client := elastic.GetClient(global.MustLookupString(elastic.GlobalSystemElasticsearchID))
- handler := ElasticORM{Client: client, Config: moduleConfig.ORMConfig}
- orm.Register("elastic", &handler)
+ if !hasSystemCluster {
+ // Allow the module to start before the system cluster is registered so bootstrap and
+ // migration flows can finish wiring the cluster first, then re-enable ORM-backed features.
+ log.Warn("skip elastic ORM initialization, system cluster is not available")
+ } else {
+ client := elastic.GetClient(systemID)
+ handler := ElasticORM{Client: client, Config: moduleConfig.ORMConfig}
+ orm.Register("elastic", &handler)
+ }
}
if moduleConfig.StoreConfig.Enabled {
- client := elastic.GetClient(global.MustLookupString(elastic.GlobalSystemElasticsearchID))
- module.storeHandler = &ElasticStore{Client: client, Config: moduleConfig.StoreConfig}
- kv.Register("elastic", module.storeHandler)
+ if !hasSystemCluster {
+ log.Warn("skip elastic store initialization, system cluster is not available")
+ } else {
+ client := elastic.GetClient(systemID)
+ module.storeHandler = &ElasticStore{Client: client, Config: moduleConfig.StoreConfig}
+ kv.Register("elastic", module.storeHandler)
+ }
}
- if moduleConfig.ORMConfig.Enabled {
+ if moduleConfig.ORMConfig.Enabled && hasSystemCluster {
if !ormInited {
//init template
InitTemplate(false)
@@ -418,8 +445,12 @@ func (module *ElasticModule) Start() error {
}
if moduleConfig.RemoteConfigEnabled {
- m := loadESBasedElasticConfig()
- initElasticInstances(m, elastic.ElasticsearchConfigSourceElasticsearch)
+ if !hasSystemCluster {
+ log.Warn("skip remote elastic config loading, system cluster is not available")
+ } else {
+ m := loadESBasedElasticConfig()
+ initElasticInstances(m, elastic.ElasticsearchConfigSourceElasticsearch)
+ }
}
if module.storeHandler != nil {
@@ -693,7 +724,20 @@ func (module *ElasticModule) refreshAllClusterMetadata() {
log.Trace("update elasticsearch's metadata:", v, ok)
if ok {
- module.updateNodeInfo(v, false, v.Config.Discovery.Enabled)
+ cfg := elastic.GetConfigNoPanic(v.Config.ID)
+ if cfg == nil {
+ // Metadata can outlive the config during remote-config reloads or migration cleanup.
+ // Drop it here so later workers do not keep probing a cluster that was already removed.
+ log.Debugf("elasticsearch metadata [%v] has no active config, removing stale metadata", v.Config.ID)
+ elastic.RemoveInstance(v.Config.ID)
+ elastic.RemoveHostsByClusterID(v.Config.ID)
+ return true
+ }
+ v.Config = cfg
+ if !cfg.Enabled || (cfg.MetadataConfigs != nil && !cfg.MetadataConfigs.MetadataRefresh.Enabled) {
+ return true
+ }
+ module.updateNodeInfo(v, false, cfg.Discovery.Enabled)
}
return true
})
@@ -706,6 +750,19 @@ func (module *ElasticModule) refreshAllClusterAlias(force bool) {
}
v, ok := value.(*elastic.ElasticsearchMetadata)
if ok {
+ cfg := elastic.GetConfigNoPanic(v.Config.ID)
+ if cfg == nil {
+ // Keep alias refresh in sync with metadata refresh: once the config is gone, clear any
+ // cached hosts/metadata so the next initialization starts from the active config set only.
+ log.Debugf("elasticsearch metadata [%v] has no active config, removing stale metadata", v.Config.ID)
+ elastic.RemoveInstance(v.Config.ID)
+ elastic.RemoveHostsByClusterID(v.Config.ID)
+ return true
+ }
+ v.Config = cfg
+ if !cfg.Enabled || (cfg.MetadataConfigs != nil && !cfg.MetadataConfigs.MetadataRefresh.Enabled) {
+ return true
+ }
updateAliases(v, force)
}
return true
diff --git a/modules/elastic/module_test.go b/modules/elastic/module_test.go
index 3accbf667..4594315b7 100644
--- a/modules/elastic/module_test.go
+++ b/modules/elastic/module_test.go
@@ -1,48 +1,60 @@
-// Copyright (C) INFINI Labs & INFINI LIMITED.
-//
-// The INFINI Framework is offered under the GNU Affero General Public License v3.0
-// and as commercial software.
-//
-// For commercial licensing, contact us at:
-// - Website: infinilabs.com
-// - Email: hello@infini.ltd
-//
-// Open Source licensed under AGPL V3:
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see .
-
package elastic
import (
- "fmt"
- "github.com/buger/jsonparser"
- "infini.sh/framework/core/util"
"testing"
+
+ coreElastic "infini.sh/framework/core/elastic"
+ "infini.sh/framework/core/global"
)
-func TestV7GetClusterStates(t *testing.T) {
- str := "{ \"_nodes\": { \"total\": 1, \"successful\": 1, \"failed\": 0 }, \"cluster_name\": \"es-v700\", \"cluster_uuid\": \"7NtDffC3RzGChhoOmgySig\", \"timestamp\": 1629611578327, \"status\": \"green\", \"indices\": { \"count\": 0, \"shards\": {}, \"docs\": { \"count\": 0, \"deleted\": 0 }, \"store\": { \"size_in_bytes\": 0 }, \"fielddata\": { \"memory_size_in_bytes\": 0, \"evictions\": 0 }, \"query_cache\": { \"memory_size_in_bytes\": 0, \"total_count\": 0, \"hit_count\": 0, \"miss_count\": 0, \"cache_size\": 0, \"cache_count\": 0, \"evictions\": 0 }, \"completion\": { \"size_in_bytes\": 0 }, \"segments\": { \"count\": 0, \"memory_in_bytes\": 0, \"terms_memory_in_bytes\": 0, \"stored_fields_memory_in_bytes\": 0, \"term_vectors_memory_in_bytes\": 0, \"norms_memory_in_bytes\": 0, \"points_memory_in_bytes\": 0, \"doc_values_memory_in_bytes\": 0, \"index_writer_memory_in_bytes\": 0, \"version_map_memory_in_bytes\": 0, \"fixed_bit_set_memory_in_bytes\": 0, \"max_unsafe_auto_id_timestamp\": -9223372036854776000, \"file_sizes\": {} } }, \"nodes\": { \"count\": { \"total\": 1, \"data\": 1, \"coordinating_only\": 0, \"master\": 1, \"ingest\": 1 }, \"versions\": [ \"7.0.0\" ], \"os\": { \"available_processors\": 24, \"allocated_processors\": 24, \"names\": [ { \"name\": \"Windows 10\", \"count\": 1 } ], \"pretty_names\": [ { \"pretty_name\": \"Windows 10\", \"count\": 1 } ], \"mem\": { \"total_in_bytes\": 137121308672, \"free_in_bytes\": 114813546496, \"used_in_bytes\": 22307762176, \"free_percent\": 84, \"used_percent\": 16 } }, \"process\": { \"cpu\": { \"percent\": 0 }, \"open_file_descriptors\": { \"min\": -1, \"max\": -1, \"avg\": 0 } }, \"jvm\": { \"max_uptime_in_millis\": 2021226, \"versions\": [ { \"version\": \"9.0.1.3\", \"vm_name\": \"OpenJDK 64-Bit Server VM\", \"vm_version\": \"9.0.1.3+11\", \"vm_vendor\": \"Azul Systems, Inc.\", \"bundled_jdk\": false, \"using_bundled_jdk\": null, \"count\": 1 } ], \"mem\": { \"heap_used_in_bytes\": 277003800, 
\"heap_max_in_bytes\": 1037959168 }, \"threads\": 66 }, \"fs\": { \"total_in_bytes\": 6000527532032, \"free_in_bytes\": 3111816585216, \"available_in_bytes\": 3111816585216 }, \"plugins\": [], \"network_types\": { \"transport_types\": { \"netty4\": 1 }, \"http_types\": { \"netty4\": 1 } }, \"discovery_types\": { \"zen\": 1 } } }"
+// TestLoadESBasedElasticConfigSkipsWhenSystemClusterUnavailable registers an empty system
+// cluster id and expects loadESBasedElasticConfig to return no remote configs. The previously
+// registered value is put back when the test returns.
+func TestLoadESBasedElasticConfigSkipsWhenSystemClusterUnavailable(t *testing.T) {
+	saved := global.Lookup(coreElastic.GlobalSystemElasticsearchID)
+	defer func() {
+		global.Register(coreElastic.GlobalSystemElasticsearchID, saved)
+	}()
+	global.Register(coreElastic.GlobalSystemElasticsearchID, "")
+
+	if got := len(loadESBasedElasticConfig()); got != 0 {
+		t.Fatalf("expected no remote configs when system cluster id is unavailable, got %d", got)
+	}
+}
+
+// TestLookupSystemElasticsearchIDRejectsInvalidValues registers a non-string value (123) under
+// the system cluster key and expects lookupSystemElasticsearchID to report ok == false. The
+// previously registered value is put back when the test returns.
+func TestLookupSystemElasticsearchIDRejectsInvalidValues(t *testing.T) {
+	saved := global.Lookup(coreElastic.GlobalSystemElasticsearchID)
+	defer func() {
+		global.Register(coreElastic.GlobalSystemElasticsearchID, saved)
+	}()
+	global.Register(coreElastic.GlobalSystemElasticsearchID, 123)
+
+	if id, accepted := lookupSystemElasticsearchID(); accepted {
+		t.Fatalf("expected invalid system cluster value to be rejected, got %q", id)
+	}
+}
+
+// TestElasticModuleStartSkipsSystemClusterDependentInitBeforeSetup registers an empty system
+// cluster id, enables the ORM, store and remote-config features on a default module config, and
+// expects ElasticModule.Start to return nil. The package-level moduleConfig and ormInited values
+// and the registered system cluster id are restored when the test returns.
+func TestElasticModuleStartSkipsSystemClusterDependentInitBeforeSetup(t *testing.T) {
+ previousSystemID := global.Lookup(coreElastic.GlobalSystemElasticsearchID)
+ defer global.Register(coreElastic.GlobalSystemElasticsearchID, previousSystemID)
+
+ previousModuleConfig := moduleConfig
+ defer func() {
+ moduleConfig = previousModuleConfig
+ }()
+
+ previousOrmInited := ormInited
+ defer func() {
+ ormInited = previousOrmInited
+ }()
+
+ global.Register(coreElastic.GlobalSystemElasticsearchID, "")
+
+ moduleConfig = getDefaultConfig()
+ moduleConfig.ORMConfig.Enabled = true
+ moduleConfig.StoreConfig.Enabled = true
+ moduleConfig.RemoteConfigEnabled = true
+ ormInited = false
- d1, err := jsonparser.GetInt(util.UnsafeStringToBytes(str), "indices", "segments", "max_unsafe_auto_id_timestamp")
- fmt.Println("xv:", d1, err)
- if err != nil {
- d, err := jsonparser.Set(util.UnsafeStringToBytes(str), []byte("-1"), "indices", "segments", "max_unsafe_auto_id_timestamp")
- if err == nil {
- str = util.UnsafeBytesToString(d)
- }
+ module := &ElasticModule{}
+ if err := module.Start(); err != nil {
+ t.Fatalf("expected elastic module start to succeed before setup, got %v", err)
}
- d1, err = jsonparser.GetInt(util.UnsafeStringToBytes(str), "indices", "segments", "max_unsafe_auto_id_timestamp")
- fmt.Println("xv:", d1, err)
- //xv,err:=jsonparser.GetInt([]byte(str),"indices.segments.max_unsafe_auto_id_timestamp")
- //fmt.Println("xv:",xv,err)
}
diff --git a/modules/elastic/schema.go b/modules/elastic/schema.go
index a437af49f..80119c1c1 100755
--- a/modules/elastic/schema.go
+++ b/modules/elastic/schema.go
@@ -35,6 +35,7 @@ import (
"sync"
"unicode"
+ "infini.sh/framework/core/elastic"
"infini.sh/framework/core/global"
"github.com/buger/jsonparser"
@@ -124,6 +125,62 @@ func parseAnnotation(mapping []util.Annotation) string {
return json
}
+// ensureDefaultStringDynamicTemplates installs a default "dynamic_templates" entry in
+// mappingData that maps fields matched as "string" to keyword with ignore_above 256. A nil
+// map, or a map that already has a "dynamic_templates" key, is left untouched.
+func ensureDefaultStringDynamicTemplates(mappingData map[string]interface{}) {
+	const templatesKey = "dynamic_templates"
+
+	if mappingData == nil {
+		return
+	}
+	if _, present := mappingData[templatesKey]; present {
+		return
+	}
+
+	keywordMapping := util.MapStr{
+		"type":         "keyword",
+		"ignore_above": 256,
+	}
+	stringsRule := util.MapStr{
+		"match_mapping_type": "string",
+		"mapping":            keywordMapping,
+	}
+	mappingData[templatesKey] = []interface{}{
+		util.MapStr{"strings": stringsRule},
+	}
+}
+
+// containsKeyDeep reports whether targetKey occurs as a map key anywhere inside value. Only
+// map[string]interface{} and []interface{} containers are descended into; every other type
+// (including nil) yields false.
+func containsKeyDeep(value interface{}, targetKey string) bool {
+	switch node := value.(type) {
+	case map[string]interface{}:
+		// A direct hit at this level wins before any nested value is inspected.
+		if _, found := node[targetKey]; found {
+			return true
+		}
+		for _, child := range node {
+			if containsKeyDeep(child, targetKey) {
+				return true
+			}
+		}
+	case []interface{}:
+		for i := range node {
+			if containsKeyDeep(node[i], targetKey) {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// shouldRefreshExistingTemplate reports whether the stored template named templateName should
+// be rewritten. It returns true only when mappingData defines "dynamic_templates" and the
+// template fetched through client.GetTemplate contains no "dynamic_templates" key at any depth.
+// A fetch error is logged as a warning and treated as "do not refresh".
+func shouldRefreshExistingTemplate(client elastic.API, templateName string, mappingData map[string]interface{}) bool {
+	// Looking up a key in a nil map is safe and reports "absent", so this one check also
+	// covers a nil mappingData.
+	if _, wanted := mappingData["dynamic_templates"]; !wanted {
+		return false
+	}
+
+	existing, err := client.GetTemplate(templateName)
+	if err != nil {
+		log.Warnf("failed to inspect existing template [%s]: %v", templateName, err)
+		return false
+	}
+
+	alreadyPresent := containsKeyDeep(existing, "dynamic_templates")
+	return !alreadyPresent
+}
+
func initIndexName(t interface{}, indexName string) string {
pkg, ojbType := util.GetTypeAndPackageName(t, true)
key := fmt.Sprintf("%s-%s", pkg, ojbType)
@@ -181,6 +238,7 @@ func (handler *ElasticORM) RegisterSchemaWithName(t interface{}, indexName strin
}
return err
}
+ ensureDefaultStringDynamicTemplates(mappingData)
template, err := handler.Client.BuildTemplate(indexName+"*", nil, mappingData)
if err != nil {
if handler.Config.PanicOnInitSchemaError {
@@ -228,6 +286,44 @@ func (handler *ElasticORM) RegisterSchemaWithName(t interface{}, indexName strin
//init index
_ = handler.tryCreateInitIndex(t, indexName)
+ } else if handler.Config.BuildTemplateForObject {
+ jsonFormat := `{ %s }`
+ mapping := getIndexMapping(t)
+ js := parseAnnotation(mapping)
+ json := fmt.Sprintf(jsonFormat, quoteJson(js))
+
+ var mappingData map[string]interface{}
+ err = util.FromJSONBytes([]byte(json), &mappingData)
+ if err != nil {
+ if handler.Config.PanicOnInitSchemaError {
+ panic(err)
+ }
+ return err
+ }
+ ensureDefaultStringDynamicTemplates(mappingData)
+ if shouldRefreshExistingTemplate(handler.Client, indexTemplate, mappingData) {
+ template, err := handler.Client.BuildTemplate(indexName+"*", nil, mappingData)
+ if err != nil {
+ if handler.Config.PanicOnInitSchemaError {
+ panic(err)
+ }
+ return err
+ }
+ data, err := handler.Client.PutTemplate(indexTemplate, template)
+ if err != nil {
+ if handler.Config.PanicOnInitSchemaError {
+ panic(err)
+ }
+ return err
+ }
+ x, _, _, _ := jsonparser.Get(data, "error")
+ if x != nil {
+ log.Errorf("error on update template: %v, %v", indexName, string(x))
+ if handler.Config.PanicOnInitSchemaError {
+ panic(string(data))
+ }
+ }
+ }
}
return err
}
diff --git a/modules/elastic/schema_test.go b/modules/elastic/schema_test.go
index 518da84a1..ae2c39330 100644
--- a/modules/elastic/schema_test.go
+++ b/modules/elastic/schema_test.go
@@ -91,3 +91,19 @@ func TestQuoteWithUnderscore(t *testing.T) {
json := quoteJson(js)
assert.Equal(t, json, `{ "properties":{ "id": { "type": "keyword" },"created": { "type": "date" },"updated": { "type": "date" },"_system": { "type": "object" },"name": { "type": "keyword" } } }`)
}
+
+// TestEnsureDefaultStringDynamicTemplates checks that a mapping without "dynamic_templates"
+// receives a single-element []interface{} under that key.
+func TestEnsureDefaultStringDynamicTemplates(t *testing.T) {
+	timestampField := map[string]interface{}{"type": "date"}
+	mapping := map[string]interface{}{
+		"properties": map[string]interface{}{"timestamp": timestampField},
+	}
+
+	ensureDefaultStringDynamicTemplates(mapping)
+
+	templates, isSlice := mapping["dynamic_templates"].([]interface{})
+	assert.Equal(t, isSlice, true)
+	assert.Equal(t, len(templates), 1)
+}
diff --git a/modules/pipeline/model.go b/modules/pipeline/model.go
index 10c4b52f5..cd386f58b 100644
--- a/modules/pipeline/model.go
+++ b/modules/pipeline/model.go
@@ -31,11 +31,18 @@ import (
)
+// PipelineTaskStatus is the JSON status payload that getPipelineTaskStatus builds for one
+// pipeline task.
type PipelineTaskStatus struct {
- State pipeline.RunningState `json:"state"`
- CreateTime time.Time `json:"create_time"`
- StartTime *time.Time `json:"start_time"`
- EndTime *time.Time `json:"end_time"`
- Context util.MapStr `json:"context"`
- Config *pipeline.PipelineConfigV2 `json:"config"`
- Processors []map[string]interface{} `json:"processor"`
+ // State is the context's current running state (GetRunningState).
+ State pipeline.RunningState `json:"state"`
+ // LastRunState is the context's result state (GetResultState); omitted from JSON when empty.
+ LastRunState pipeline.RunningState `json:"last_run_state,omitempty"`
+ CreateTime time.Time `json:"create_time"`
+ StartTime *time.Time `json:"start_time"`
+ EndTime *time.Time `json:"end_time"`
+ Context util.MapStr `json:"context"`
+ // Result is set by getPipelineTaskStatus only when LastRunState is FINISHED or FAILED.
+ Result *PipelineResult `json:"result,omitempty"`
+ Config *pipeline.PipelineConfigV2 `json:"config"`
+ Processors []map[string]interface{} `json:"processor"`
+}
+
+// PipelineResult summarizes a completed run: getPipelineTaskStatus sets Success to true when
+// the context's result error string is empty and copies that string into Error.
+type PipelineResult struct {
+ Success bool `json:"success"`
+ Error string `json:"error,omitempty"`
}
diff --git a/modules/pipeline/module.go b/modules/pipeline/module.go
index 575a0da85..854aa9899 100755
--- a/modules/pipeline/module.go
+++ b/modules/pipeline/module.go
@@ -168,6 +168,13 @@ func (module *PipeModule) stopTask(taskID string) (exists bool) {
// deleteTask will clean all in-memory states and release the pipeline context
func (module *PipeModule) deleteTask(taskID string) {
+ if ctx, ok := module.contexts.Load(taskID); ok {
+ // Wait for the worker loop to observe cancellation before dropping the context so a
+ // re-created migration task does not race with the previous loop's final cleanup.
+ if v1, ok := ctx.(*pipeline.Context); ok && !v1.IsLoopReleased() {
+ module.stopAndWaitForRelease([]string{taskID}, time.Minute)
+ }
+ }
module.pipelines.Delete(taskID)
module.configs.Delete(taskID)
module.releaseContext(taskID)
diff --git a/modules/pipeline/pipeline_test.go b/modules/pipeline/pipeline_test.go
new file mode 100644
index 000000000..2ee932a0b
--- /dev/null
+++ b/modules/pipeline/pipeline_test.go
@@ -0,0 +1,73 @@
+// Copyright (C) INFINI Labs & INFINI LIMITED.
+//
+// The INFINI Framework is offered under the GNU Affero General Public License v3.0
+// and as commercial software.
+//
+// For commercial licensing, contact us at:
+// - Website: infinilabs.com
+// - Email: hello@infini.ltd
+//
+// Open Source licensed under AGPL V3:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package pipeline
+
+import (
+ "testing"
+ "time"
+
+ corepipeline "infini.sh/framework/core/pipeline"
+)
+
+// TestDeleteTaskWaitsForLoopRelease stores a context, config and pipeline entry for one task,
+// starts a goroutine that marks the loop released 50ms after it observes cancellation, and then
+// checks that deleteTask returns only after that release and that all three entries are gone.
+func TestDeleteTaskWaitsForLoopRelease(t *testing.T) {
+	const (
+		taskID       = "task-1"
+		releaseDelay = 50 * time.Millisecond
+	)
+
+	module := &PipeModule{}
+	ctx := corepipeline.AcquireContext(corepipeline.PipelineConfigV2{})
+
+	module.contexts.Store(taskID, ctx)
+	module.configs.Store(taskID, corepipeline.PipelineConfigV2{Name: taskID})
+	module.pipelines.Store(taskID, struct{}{})
+
+	// Stand-in for the worker loop: poll for cancellation, linger, then signal release.
+	released := make(chan struct{})
+	go func() {
+		for !ctx.IsCanceled() {
+			time.Sleep(time.Millisecond)
+		}
+		time.Sleep(releaseDelay)
+		ctx.SetLoopReleased()
+		close(released)
+	}()
+
+	began := time.Now()
+	module.deleteTask(taskID)
+	waited := time.Since(began)
+
+	// Non-blocking receive: the channel must already be closed once deleteTask has returned.
+	select {
+	case <-released:
+	default:
+		t.Fatal("expected deleteTask to wait for loop release")
+	}
+
+	if waited < releaseDelay {
+		t.Fatalf("expected deleteTask to wait for loop release, returned after %v", waited)
+	}
+	if _, found := module.contexts.Load(taskID); found {
+		t.Fatal("expected context to be deleted")
+	}
+	if _, found := module.configs.Load(taskID); found {
+		t.Fatal("expected config to be deleted")
+	}
+	if _, found := module.pipelines.Load(taskID); found {
+		t.Fatal("expected pipeline to be deleted")
+	}
+}
diff --git a/modules/pipeline/tasks.go b/modules/pipeline/tasks.go
index eb7bc6662..88306eade 100644
--- a/modules/pipeline/tasks.go
+++ b/modules/pipeline/tasks.go
@@ -80,11 +80,20 @@ func (module *PipeModule) getPipelineTaskStatus(id string, config string, proces
return nil
}
ret := &PipelineTaskStatus{
- State: c1.GetRunningState(),
- CreateTime: c1.GetCreateTime(),
- StartTime: c1.GetStartTime(),
- EndTime: c1.GetEndTime(),
- Context: c1.CloneData(),
+ State: c1.GetRunningState(),
+ // Keep the current runtime state and the last completed result separate so migration
+ // callers can tell a stopped task from a run that already finished or failed.
+ LastRunState: c1.GetResultState(),
+ CreateTime: c1.GetCreateTime(),
+ StartTime: c1.GetStartTime(),
+ EndTime: c1.GetEndTime(),
+ Context: c1.CloneData(),
+ }
+ if ret.LastRunState == pipeline.FINISHED || ret.LastRunState == pipeline.FAILED {
+ ret.Result = &PipelineResult{
+ Success: c1.GetResultError() == "",
+ Error: c1.GetResultError(),
+ }
}
if config != "false" {
v1, ok := module.configs.Load(id)
diff --git a/modules/queue/disk_queue/cleanup.go b/modules/queue/disk_queue/cleanup.go
index 1fed12942..3456d881f 100644
--- a/modules/queue/disk_queue/cleanup.go
+++ b/modules/queue/disk_queue/cleanup.go
@@ -92,7 +92,7 @@ func (module *DiskQueue) deleteUnusedFiles(queueID string, fileNum int64) {
fileStartToDelete := fileNum - module.cfg.Retention.MaxNumOfLocalFiles
if fileStartToDelete <= 0 || consumers <= 0 || eSegmentNum < 0 {
- log.Debugf("queue: %v, no consumers or consumer/s3 already ahead of this file, %v, %v, %v", queueID, fileStartToDelete, consumers, eSegmentNum)
+ log.Tracef("queue: %v, no consumers or consumer/s3 already ahead of this file, %v, %v, %v", queueID, fileStartToDelete, consumers, eSegmentNum)
return
}
diff --git a/modules/queue/disk_queue/compress.go b/modules/queue/disk_queue/compress.go
index b03fdccfb..63404a21e 100644
--- a/modules/queue/disk_queue/compress.go
+++ b/modules/queue/disk_queue/compress.go
@@ -104,7 +104,7 @@ func (module *DiskQueue) compressFiles(queueID string, fileNum int64) {
//skip compress file
if fileStartToCompress <= 0 || (module.cfg.SkipZeroConsumers && consumers <= 0) || fileStartToCompress <= lastCompressedFileNum {
- log.Debugf("skip compress %v", queueID)
+ log.Tracef("skip compress %v", queueID)
return
}
diff --git a/modules/queue/disk_queue/consumer.go b/modules/queue/disk_queue/consumer.go
index c18bbd83e..c27746770 100644
--- a/modules/queue/disk_queue/consumer.go
+++ b/modules/queue/disk_queue/consumer.go
@@ -68,6 +68,32 @@ type Consumer struct {
fileLoadCompleted bool
}
+// parkOnEmptyTail closes the currently open read file (if any) and resets the consumer's
+// reader state so that it points at fileName with nothing loaded. A Close error is returned
+// unless its message contains "already" (presumably an "already closed" file; not confirmed
+// here), in which case it is ignored.
+func (d *Consumer) parkOnEmptyTail(fileName string) error {
+	if f := d.readFile; f != nil {
+		err := f.Close()
+		if err != nil && !util.ContainStr(err.Error(), "already") {
+			return err
+		}
+	}
+
+	d.fileName = fileName
+	d.readFile, d.reader = nil, nil
+	d.lastFileSize, d.maxBytesPerFileRead = 0, 0
+	d.fileLoadCompleted = false
+	return nil
+}
+
+// waitingForTailFile reports whether the consumer is parked on the writer's current segment:
+// it has a disk queue, no open read file and no reader, and sits at position 0 of the segment
+// the queue is currently writing. FetchMessages treats this as a normal catch-up state rather
+// than as a missing reader.
+func (d *Consumer) waitingForTailFile() bool {
+	if d.diskQueue == nil {
+		return false
+	}
+	if d.readFile != nil || d.reader != nil {
+		return false
+	}
+	return d.segment == d.diskQueue.writeSegmentNum && d.readPos == 0
+}
+
func (c *Consumer) getFileSize() int64 {
var err error
readFile, err := os.OpenFile(c.fileName, os.O_RDONLY, 0600)
@@ -144,6 +170,22 @@ READ_MSG:
// check reader
if d.reader == nil {
+ if d.waitingForTailFile() {
+ if d.diskQueue.writePos > 0 || util.FileExists(d.fileName) {
+ err = d.ResetOffset(d.segment, d.readPos)
+ if err != nil {
+ if strings.Contains(err.Error(), "not found") {
+ return messages, false, nil
+ }
+ return messages, false, err
+ }
+ goto READ_MSG
+ }
+ if len(messages) == 0 && d.cCfg.EOFRetryDelayInMs > 0 {
+ time.Sleep(time.Duration(d.cCfg.EOFRetryDelayInMs) * time.Millisecond)
+ }
+ return messages, false, nil
+ }
return messages, false, errors.New("reader is nil")
}
//read message size
@@ -236,7 +278,7 @@ READ_MSG:
}
return messages, false, err
}
- log.Debugf("queue:%v, offset:%v,%v, msgSize:%v", d.queue, d.segment, d.readPos, msgSize)
+ log.Tracef("queue:%v, offset:%v,%v, msgSize:%v", d.queue, d.segment, d.readPos, msgSize)
if int32(msgSize) < d.mCfg.MinMsgSize || int32(msgSize) > d.mCfg.MaxMsgSize {
//current have changes, reload file with new position
newFileSize := d.getFileSize()
@@ -274,8 +316,19 @@ READ_MSG:
//can't read ahead before current write file
if nextSegment >= d.diskQueue.writeSegmentNum {
log.Debugf("need to skip to next file, but next file not exists, current write segment:%v, current read segment:%v", d.diskQueue.writeSegmentNum, d.segment)
- d.diskQueue.skipToNextRWFile(false)
+ err = d.diskQueue.skipToNextRWFile(false)
+ if err != nil {
+ return messages, false, err
+ }
d.diskQueue.needSync = true
+ err = d.ResetOffset(d.diskQueue.writeSegmentNum, 0)
+ if err != nil {
+ if strings.Contains(err.Error(), "not found") {
+ return messages, false, nil
+ }
+ return messages, false, err
+ }
+ ctx.UpdateNextOffset(d.segment, d.readPos)
} else {
//let's continue move to next file
nextSegment++
@@ -332,9 +385,9 @@ READ_MSG:
//still working on the same file
if d.diskQueue.writeSegmentNum == d.segment {
time.Sleep(100 * time.Millisecond) // Prevent catching up too quickly.
- log.Debugf("invalid message size detected. this might be due to a dirty read as the file was being written while open. reloading segment: %d", d.segment)
+ log.Tracef("invalid message size detected. this might be due to a dirty read as the file was being written while open. reloading segment: %d", d.segment)
} else {
- log.Debugf("invalid message size detected. this might be due to a partial file load. reloading segment: %d", d.segment)
+ log.Tracef("invalid message size detected. this might be due to a partial file load. reloading segment: %d", d.segment)
}
d.readPos = previousPos
@@ -509,6 +562,11 @@ func (d *Consumer) ResetOffset(segment, readPos int64) error {
if !exists {
//double check, but next file exists
if !util.FileExists(fileName) {
+ if segment == d.diskQueue.writeSegmentNum && readPos == 0 && d.diskQueue.writePos == 0 {
+ // The consumer has caught up to an empty tail segment. Park there and let FetchMessages
+ // wait for the producer to materialize the next file instead of treating the tail as corrupt.
+ return d.parkOnEmptyTail(fileName)
+ }
if d.mCfg.AutoSkipCorruptFile {
nextSegment := d.segment + 1
if nextSegment > d.diskQueue.writeSegmentNum {
@@ -518,7 +576,7 @@ func (d *Consumer) ResetOffset(segment, readPos int64) error {
d.qCfg.Name, d.queue, d.cCfg.Key(), d.segment, d.readPos, fileName)
RETRY_NEXT_FILE:
// there are segments in the middle
- if nextSegment < d.diskQueue.writeSegmentNum {
+ if nextSegment <= d.diskQueue.writeSegmentNum {
fileName, exists, next_file_exists = SmartGetFileName(d.mCfg, d.queue, nextSegment)
if exists || util.FileExists(fileName) {
log.Debugf("retry skip to next file: %v, exists", fileName)
@@ -532,6 +590,14 @@ func (d *Consumer) ResetOffset(segment, readPos int64) error {
goto RETRY_NEXT_FILE
}
} else {
+ if d.diskQueue.writePos == 0 {
+ d.segment = d.diskQueue.writeSegmentNum
+ d.readPos = 0
+ d.diskQueue.UpdateSegmentConsumerInReading(d.ID, d.segment)
+ // After skipping every missing intermediate segment, the safest recovery point is
+ // the current writer tail. Park on it so the consumer resumes from fresh data only.
+ return d.parkOnEmptyTail(GetFileName(d.queue, d.segment))
+ }
return errors.New(fileName + " not found, next segment greater than current write segment")
}
} else {
diff --git a/modules/queue/disk_queue/diskqueue.go b/modules/queue/disk_queue/diskqueue.go
index d29a8f1da..e3f8c9278 100644
--- a/modules/queue/disk_queue/diskqueue.go
+++ b/modules/queue/disk_queue/diskqueue.go
@@ -69,6 +69,8 @@ import (
"infini.sh/framework/core/util/zstd"
)
+const bytesPerMiB = 1024 * 1024
+
// providing a filesystem backed FIFO queue
type DiskBasedQueue struct {
sync.RWMutex
@@ -118,6 +120,8 @@ type DiskBasedQueue struct {
// NewDiskQueue instantiates a new instance of DiskBasedQueue, retrieving metadata
// from the filesystem and starting the read ahead goroutine
func NewDiskQueueByConfig(name, dataPath string, cfg *DiskQueueConfig) *DiskBasedQueue {
+ normalizeDiskQueueConfig(cfg)
+
d := DiskBasedQueue{
name: name,
dataPath: dataPath,
@@ -177,7 +181,8 @@ func (d *DiskBasedQueue) ReadChan() <-chan []byte {
// Put writes a []byte to the queue
func (d *DiskBasedQueue) Put(data []byte) WriteResponse {
- ctx, cancel := context.WithTimeout(context.Background(), time.Duration(d.cfg.WriteTimeoutInMS)*time.Millisecond)
+ writeTimeout := d.getWriteTimeout(len(data))
+ ctx, cancel := context.WithTimeout(context.Background(), writeTimeout)
defer cancel()
size := int64(len(data))
@@ -232,7 +237,7 @@ func (d *DiskBasedQueue) Put(data []byte) WriteResponse {
switch res.Error {
case context.DeadlineExceeded:
// Handle timeout error specifically
- res.Error = fmt.Errorf("operation timed out: %w", res.Error)
+ res.Error = fmt.Errorf("operation timed out after %s waiting for disk queue writer availability: %w", writeTimeout, res.Error)
case context.Canceled:
// Handle cancellation error specifically
res.Error = fmt.Errorf("operation was canceled: %w", res.Error)
@@ -244,6 +249,28 @@ func (d *DiskBasedQueue) Put(data []byte) WriteResponse {
}
}
+// getWriteTimeout returns how long Put should wait for the writer to accept a payload of
+// payloadSize bytes.
+//
+// The base is the configured WriteTimeoutInMS, or defaultWriteTimeoutInMS when the receiver,
+// its config, or the configured value is missing or non-positive. On top of that it adds
+// adaptiveWriteTimeoutPerPayloadMiBInMS per started MiB of payload and
+// adaptiveWriteTimeoutPerQueuedWriteInMS per write currently buffered in writeChan.
+//
+// The adaptive growth is capped at maxAdaptiveWriteTimeoutInMS, but the result never drops
+// below the base: an operator who explicitly configured a timeout above the cap keeps it.
+func (d *DiskBasedQueue) getWriteTimeout(payloadSize int) time.Duration {
+	baseInMS := defaultWriteTimeoutInMS
+	if d != nil && d.cfg != nil && d.cfg.WriteTimeoutInMS > 0 {
+		baseInMS = d.cfg.WriteTimeoutInMS
+	}
+	timeoutInMS := baseInMS
+
+	if payloadSize > 0 {
+		// Round up to whole MiB in int64 so the addition cannot overflow a 32-bit int.
+		payloadMiB := (int64(payloadSize) + bytesPerMiB - 1) / bytesPerMiB
+		timeoutInMS += payloadMiB * adaptiveWriteTimeoutPerPayloadMiBInMS
+	}
+
+	if d != nil {
+		// len of a nil channel is 0, so an uninitialized queue simply adds nothing.
+		timeoutInMS += int64(len(d.writeChan)) * adaptiveWriteTimeoutPerQueuedWriteInMS
+	}
+
+	// The cap limits the adaptive additions only; it must not shorten a larger configured base.
+	ceilingInMS := maxAdaptiveWriteTimeoutInMS
+	if baseInMS > ceilingInMS {
+		ceilingInMS = baseInMS
+	}
+	if timeoutInMS > ceilingInMS {
+		timeoutInMS = ceilingInMS
+	}
+
+	return time.Duration(timeoutInMS) * time.Millisecond
+}
+
// Close cleans up the queue and persists metadata
func (d *DiskBasedQueue) Close() error {
err := d.exit(false)
diff --git a/modules/queue/disk_queue/diskqueue_test.go b/modules/queue/disk_queue/diskqueue_test.go
new file mode 100644
index 000000000..7a0798fa1
--- /dev/null
+++ b/modules/queue/disk_queue/diskqueue_test.go
@@ -0,0 +1,205 @@
+// Copyright (C) INFINI Labs & INFINI LIMITED.
+//
+// The INFINI Framework is offered under the GNU Affero General Public License v3.0
+// and as commercial software.
+//
+// For commercial licensing, contact us at:
+// - Website: infinilabs.com
+// - Email: hello@infini.ltd
+//
+// Open Source licensed under AGPL V3:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package queue
+
+import (
+ "encoding/binary"
+ "os"
+ "path/filepath"
+ "testing"
+ "time"
+
+ . "infini.sh/framework/core/env"
+ "infini.sh/framework/core/global"
+ corequeue "infini.sh/framework/core/queue"
+)
+
+// TestGetWriteTimeoutIncludesPayloadAndBacklog checks that a 3 MiB payload with two writes
+// already buffered yields the default timeout plus three payload increments plus two
+// queued-write increments.
+func TestGetWriteTimeoutIncludesPayloadAndBacklog(t *testing.T) {
+	const (
+		payloadMiB   = 3
+		queuedWrites = 2
+	)
+
+	backlog := make(chan []byte, defaultWriteChanBuffer)
+	backlog <- []byte("a")
+	backlog <- []byte("b")
+	dq := &DiskBasedQueue{
+		cfg:       &DiskQueueConfig{WriteTimeoutInMS: defaultWriteTimeoutInMS},
+		writeChan: backlog,
+	}
+
+	wantInMS := defaultWriteTimeoutInMS +
+		payloadMiB*adaptiveWriteTimeoutPerPayloadMiBInMS +
+		queuedWrites*adaptiveWriteTimeoutPerQueuedWriteInMS
+	want := time.Duration(wantInMS) * time.Millisecond
+
+	if got := dq.getWriteTimeout(payloadMiB * bytesPerMiB); got != want {
+		t.Fatalf("unexpected write timeout: got %s want %s", got, want)
+	}
+}
+
+// TestGetWriteTimeoutCapsAtMaximum fills the write channel to capacity, asks for the timeout of
+// a 64 MiB payload, and expects the result to be clamped to maxAdaptiveWriteTimeoutInMS.
+func TestGetWriteTimeoutCapsAtMaximum(t *testing.T) {
+	dq := &DiskBasedQueue{
+		cfg:       &DiskQueueConfig{WriteTimeoutInMS: defaultWriteTimeoutInMS},
+		writeChan: make(chan []byte, defaultWriteChanBuffer),
+	}
+	for len(dq.writeChan) < cap(dq.writeChan) {
+		dq.writeChan <- []byte("x")
+	}
+
+	want := time.Duration(maxAdaptiveWriteTimeoutInMS) * time.Millisecond
+	if got := dq.getWriteTimeout(64 * bytesPerMiB); got != want {
+		t.Fatalf("unexpected capped timeout: got %s want %s", got, want)
+	}
+}
+
+// TestResetOffsetSkipsMissingSegmentsUpToCurrentWriteSegment creates only the segment-2 file
+// (one length-prefixed message) in a temporary data directory, marks segment 2 as the current
+// write segment, and asks the consumer to reset to segment 1, for which no file was created.
+// With AutoSkipCorruptFile enabled it expects ResetOffset to succeed, the consumer to land on
+// segment 2, and a reader to be open.
+func TestResetOffsetSkipsMissingSegmentsUpToCurrentWriteSegment(t *testing.T) {
+ env1 := EmptyEnv()
+ env1.SystemConfig.PathConfig.Data = t.TempDir()
+ global.RegisterEnv(env1)
+
+ queueName := "reset-offset-skip"
+ data := []byte("ok")
+ fileName := GetFileName(queueName, 2)
+ if err := os.MkdirAll(filepath.Dir(fileName), 0o755); err != nil {
+ t.Fatalf("failed to create queue dir: %v", err)
+ }
+ file, err := os.Create(fileName)
+ if err != nil {
+ t.Fatalf("failed to create segment file: %v", err)
+ }
+ // Segment layout written here: a big-endian int32 message size followed by the message body.
+ if err := binary.Write(file, binary.BigEndian, int32(len(data))); err != nil {
+ t.Fatalf("failed to write message size: %v", err)
+ }
+ if _, err := file.Write(data); err != nil {
+ t.Fatalf("failed to write message body: %v", err)
+ }
+ if err := file.Close(); err != nil {
+ t.Fatalf("failed to close segment file: %v", err)
+ }
+
+ // writePos covers the 4-byte size prefix plus the body, so the write segment is non-empty.
+ dq := &DiskBasedQueue{
+ name: queueName,
+ cfg: &DiskQueueConfig{AutoSkipCorruptFile: true, MinMsgSize: 1, MaxMsgSize: 1024},
+ writeSegmentNum: 2,
+ writePos: int64(4 + len(data)),
+ }
+ consumer := &Consumer{
+ ID: "consumer-reset",
+ diskQueue: dq,
+ mCfg: dq.cfg,
+ qCfg: &corequeue.QueueConfig{Name: queueName},
+ cCfg: &corequeue.ConsumerConfig{},
+ queue: queueName,
+ }
+
+ if err := consumer.ResetOffset(1, 0); err != nil {
+ t.Fatalf("expected reset offset to skip to current write segment, got %v", err)
+ }
+ if consumer.segment != 2 {
+ t.Fatalf("expected consumer to move to segment 2, got %d", consumer.segment)
+ }
+ if consumer.reader == nil {
+ t.Fatalf("expected consumer reader to be initialized for segment 2")
+ }
+}
+
+// TestFetchMessagesRecoversToEmptyTailWithoutRescanningCorruptFile runs in two phases.
+//
+// Phase 1: segment 1 holds only four bytes (0x7f 0xff 0xff 0xff) and the queue reports an empty
+// write segment (writePos 0). FetchMessages must return no messages, no timeout and no error,
+// leave the consumer on the queue's current write segment, and advance ctx.NextOffset to
+// position 0 of that segment.
+//
+// Phase 2: a file with one length-prefixed "hello" message is created for the write segment and
+// writePos is updated. The next FetchMessages call must return exactly that message.
+func TestFetchMessagesRecoversToEmptyTailWithoutRescanningCorruptFile(t *testing.T) {
+ env1 := EmptyEnv()
+ env1.SystemConfig.PathConfig.Data = t.TempDir()
+ global.RegisterEnv(env1)
+
+ queueName := "fetch-empty-tail"
+ corruptFile := GetFileName(queueName, 1)
+ if err := os.MkdirAll(filepath.Dir(corruptFile), 0o755); err != nil {
+ t.Fatalf("failed to create queue dir: %v", err)
+ }
+ // NOTE(review): presumably these four bytes decode as a message size far above MaxMsgSize
+ // (1024) and so count as corrupt; confirm against the reader's size decoding.
+ if err := os.WriteFile(corruptFile, []byte{0x7f, 0xff, 0xff, 0xff}, 0o644); err != nil {
+ t.Fatalf("failed to write corrupt segment: %v", err)
+ }
+
+ dq := &DiskBasedQueue{
+ name: queueName,
+ cfg: &DiskQueueConfig{AutoSkipCorruptFile: true, MinMsgSize: 1, MaxMsgSize: 1024},
+ writeSegmentNum: 3,
+ writePos: 0,
+ }
+ consumer := &Consumer{
+ ID: "consumer-fetch",
+ diskQueue: dq,
+ mCfg: dq.cfg,
+ qCfg: &corequeue.QueueConfig{Name: queueName},
+ cCfg: &corequeue.ConsumerConfig{},
+ queue: queueName,
+ }
+
+ if err := consumer.ResetOffset(1, 0); err != nil {
+ t.Fatalf("failed to initialize consumer: %v", err)
+ }
+
+ ctx := &corequeue.Context{}
+ messages, timeout, err := consumer.FetchMessages(ctx, 1)
+ if err != nil {
+ t.Fatalf("expected corruption recovery without error, got %v", err)
+ }
+ if timeout {
+ t.Fatalf("did not expect timeout during corruption recovery")
+ }
+ if len(messages) != 0 {
+ t.Fatalf("expected no messages during recovery, got %d", len(messages))
+ }
+ // dq.writeSegmentNum is re-read rather than hard-coded to 3: these checks compare against
+ // whatever write segment the queue reports after the fetch.
+ if consumer.segment != dq.writeSegmentNum {
+ t.Fatalf("expected consumer to park on new tail segment %d, got %d", dq.writeSegmentNum, consumer.segment)
+ }
+ if ctx.NextOffset.Segment != dq.writeSegmentNum || ctx.NextOffset.Position != 0 {
+ t.Fatalf("expected next offset to advance to new tail, got %v", ctx.NextOffset)
+ }
+
+ // Phase 2: materialize the tail segment with a single message and let the consumer resume.
+ payload := []byte("hello")
+ tailFile := GetFileName(queueName, dq.writeSegmentNum)
+ file, err := os.Create(tailFile)
+ if err != nil {
+ t.Fatalf("failed to create new tail segment: %v", err)
+ }
+ if err := binary.Write(file, binary.BigEndian, int32(len(payload))); err != nil {
+ t.Fatalf("failed to write tail message size: %v", err)
+ }
+ if _, err := file.Write(payload); err != nil {
+ t.Fatalf("failed to write tail message body: %v", err)
+ }
+ if err := file.Close(); err != nil {
+ t.Fatalf("failed to close tail segment: %v", err)
+ }
+ dq.writePos = int64(4 + len(payload))
+
+ ctx = &corequeue.Context{}
+ messages, timeout, err = consumer.FetchMessages(ctx, 1)
+ if err != nil {
+ t.Fatalf("expected consumer to resume reading on new tail, got %v", err)
+ }
+ if timeout {
+ t.Fatalf("did not expect timeout when new tail data exists")
+ }
+ if len(messages) != 1 {
+ t.Fatalf("expected exactly one message, got %d", len(messages))
+ }
+ if string(messages[0].Data) != "hello" {
+ t.Fatalf("expected payload %q, got %q", "hello", string(messages[0].Data))
+ }
+}
diff --git a/modules/queue/disk_queue/module.go b/modules/queue/disk_queue/module.go
index d7b1d8ee9..2383a68b4 100644
--- a/modules/queue/disk_queue/module.go
+++ b/modules/queue/disk_queue/module.go
@@ -124,8 +124,33 @@ type CompressConfig struct {
Level int `config:"level"`
}
+// Defaults and thresholds for disk_queue write settings.
+const (
+ // defaultWriteTimeoutInMS (60 * 1000) replaces a non-positive WriteTimeoutInMS.
+ defaultWriteTimeoutInMS int64 = 60 * 1000
+ // defaultWriteChanBuffer replaces a non-positive WriteChanBuffer.
+ defaultWriteChanBuffer = 16
+ // minRecommendedWriteTimeoutInMS: explicit timeouts below this are kept but logged as a warning.
+ minRecommendedWriteTimeoutInMS int64 = 15 * 1000
+ // NOTE(review): the adaptive* values below are not referenced in this part of the change;
+ // presumably they cap and scale a per-write timeout by queue depth and payload size — confirm at the use site.
+ maxAdaptiveWriteTimeoutInMS int64 = 5 * 60 * 1000
+ adaptiveWriteTimeoutPerQueuedWriteInMS int64 = 3 * 1000
+ adaptiveWriteTimeoutPerPayloadMiBInMS int64 = 5 * 1000
+)
+
var preventRead bool
+// normalizeDiskQueueConfig fills in write-related defaults on cfg in place.
+// A nil cfg is a no-op. Non-positive WriteTimeoutInMS and WriteChanBuffer values are
+// replaced with defaultWriteTimeoutInMS and defaultWriteChanBuffer; explicit positive
+// values are never modified.
+func normalizeDiskQueueConfig(cfg *DiskQueueConfig) {
+ if cfg == nil {
+ return
+ }
+
+ if cfg.WriteTimeoutInMS <= 0 {
+ cfg.WriteTimeoutInMS = defaultWriteTimeoutInMS
+ } else if cfg.WriteTimeoutInMS < minRecommendedWriteTimeoutInMS {
+ // Keep the user's explicit value; only warn that it is below the recommended minimum.
+ log.Warnf("disk_queue write timeout may be too small on slow disks: %dms", cfg.WriteTimeoutInMS)
+ }
+
+ if cfg.WriteChanBuffer <= 0 {
+ cfg.WriteChanBuffer = defaultWriteChanBuffer
+ }
+}
+
func checkCapacity(cfg *DiskQueueConfig) error {
if cfg.CheckDiskCapacityRetryDelayInMs <= 0 {
@@ -233,20 +258,20 @@ func (module *DiskQueue) Setup() {
MinMsgSize: 1,
MaxMsgSize: 104857600, //100MB
MaxBytesPerFile: 100 * 1024 * 1024, //100MB
- WriteTimeoutInMS: 1000, //1s
- CheckDiskCapacityRetryDelayInMs: 10 * 000, //10s
+ WriteTimeoutInMS: defaultWriteTimeoutInMS,
+ CheckDiskCapacityRetryDelayInMs: 10 * 000, //10s
EOFRetryDelayInMs: 500,
SyncEveryRecords: 1000,
SyncTimeoutInMS: 1000,
NotifyChanBuffer: 100,
ReadChanBuffer: 0,
- WriteChanBuffer: 0,
+ WriteChanBuffer: defaultWriteChanBuffer,
WarningFreeBytes: 10 * 1024 * 1024 * 1024,
ReservedFreeBytes: 5 * 1024 * 1024 * 1024,
PrepareFilesToRead: true,
Compress: DiskCompress{
IdleThreshold: 3,
- DeleteAfterCompress: false,
+ DeleteAfterCompress: true,
NumOfFilesDecompressAhead: 3,
Message: CompressConfig{
Enabled: false,
@@ -262,6 +287,8 @@ func (module *DiskQueue) Setup() {
panic(err)
}
+ normalizeDiskQueueConfig(module.cfg)
+
if !module.cfg.Enabled {
return
}
diff --git a/modules/queue/disk_queue/module_test.go b/modules/queue/disk_queue/module_test.go
new file mode 100644
index 000000000..4b51e7524
--- /dev/null
+++ b/modules/queue/disk_queue/module_test.go
@@ -0,0 +1,72 @@
+// Copyright (C) INFINI Labs & INFINI LIMITED.
+//
+// The INFINI Framework is offered under the GNU Affero General Public License v3.0
+// and as commercial software.
+//
+// For commercial licensing, contact us at:
+// - Website: infinilabs.com
+// - Email: hello@infini.ltd
+//
+// Open Source licensed under AGPL V3:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+package queue
+
+import (
+ "testing"
+
+ . "infini.sh/framework/core/env"
+ "infini.sh/framework/core/global"
+)
+
+// TestNormalizeDiskQueueConfigAppliesRobustWriteDefaults checks that a zero-value config
+// receives defaultWriteTimeoutInMS and defaultWriteChanBuffer after normalization.
+func TestNormalizeDiskQueueConfigAppliesRobustWriteDefaults(t *testing.T) {
+ cfg := &DiskQueueConfig{}
+
+ normalizeDiskQueueConfig(cfg)
+
+ if cfg.WriteTimeoutInMS != defaultWriteTimeoutInMS {
+ t.Fatalf("unexpected write timeout: %d", cfg.WriteTimeoutInMS)
+ }
+ if cfg.WriteChanBuffer != defaultWriteChanBuffer {
+ t.Fatalf("unexpected write chan buffer: %d", cfg.WriteChanBuffer)
+ }
+}
+
+// TestNormalizeDiskQueueConfigKeepsExplicitWriteSettings checks that positive, explicitly
+// configured write settings are left unchanged by normalization.
+func TestNormalizeDiskQueueConfigKeepsExplicitWriteSettings(t *testing.T) {
+ cfg := &DiskQueueConfig{
+ WriteTimeoutInMS: 45 * 1000,
+ WriteChanBuffer: 64,
+ }
+
+ normalizeDiskQueueConfig(cfg)
+
+ if cfg.WriteTimeoutInMS != 45*1000 {
+ t.Fatalf("write timeout should be preserved, got %d", cfg.WriteTimeoutInMS)
+ }
+ if cfg.WriteChanBuffer != 64 {
+ t.Fatalf("write chan buffer should be preserved, got %d", cfg.WriteChanBuffer)
+ }
+}
+
+// TestSetupDefaultsDeleteAfterCompress registers an empty environment, runs Setup on a
+// zero-value DiskQueue and checks that Compress.DeleteAfterCompress ends up true.
+// NOTE(review): assumes an empty environment supplies no override for this setting — confirm.
+func TestSetupDefaultsDeleteAfterCompress(t *testing.T) {
+ env1 := EmptyEnv()
+ global.RegisterEnv(env1)
+
+ module := DiskQueue{}
+ module.Setup()
+
+ if !module.cfg.Compress.DeleteAfterCompress {
+ t.Fatalf("delete_after_compress should default to true")
+ }
+}
diff --git a/plugins/elastic/bulk_indexing/bulk_indexing.go b/plugins/elastic/bulk_indexing/bulk_indexing.go
index f7c1f118a..d6054e5f5 100755
--- a/plugins/elastic/bulk_indexing/bulk_indexing.go
+++ b/plugins/elastic/bulk_indexing/bulk_indexing.go
@@ -78,6 +78,8 @@ type BulkIndexingProcessor struct {
bulkBufferPool *elastic.BulkBufferPool
}
+// queueOwners maps a queue ID to the id of the BulkIndexingProcessor that currently owns it.
+// It is package-level, so every processor in this process consults the same map.
+var queueOwners sync.Map
+
type Config struct {
NumOfSlices int `config:"num_of_slices"`
Slices []int `config:"slices"`
@@ -233,7 +235,17 @@ func (processor *BulkIndexingProcessor) Process(c *pipeline.Context) error {
log.Error("error in bulk indexing processor,", v)
}
}
- log.Debug("exit bulk indexing processor")
+ if processor.bulkStats != nil {
+ log.Debugf(
+ "exit bulk indexing processor, success=%d, invalid=%d, failure=%d, error_msgs=%d",
+ processor.bulkStats.Summary.Success.Count,
+ processor.bulkStats.Summary.Invalid.Count,
+ processor.bulkStats.Summary.Failure.Count,
+ len(processor.bulkStats.ErrorMsgs),
+ )
+ } else {
+ log.Debug("exit bulk indexing processor")
+ }
}()
//handle updates
@@ -268,6 +280,7 @@ func (processor *BulkIndexingProcessor) Process(c *pipeline.Context) error {
processor.wg.Done()
}()
+ lastDispatch := time.Now()
for {
if global.ShuttingDown() {
@@ -299,11 +312,12 @@ func (processor *BulkIndexingProcessor) Process(c *pipeline.Context) error {
}
//if have depth and not in in flight
if !processor.config.SkipEmptyQueue || queue.HasLag(v) {
- _, ok := processor.inFlightQueueConfigs.Load(v.ID)
+ ok := processor.hasInFlightQueue(v.ID)
if !ok {
if global.Env().IsDebug {
log.Tracef("detecting new queue: %v", v.Name)
}
+ lastDispatch = time.Now()
processor.HandleQueueConfig(v, c)
}
} else {
@@ -315,12 +329,33 @@ func (processor *BulkIndexingProcessor) Process(c *pipeline.Context) error {
if processor.config.DetectIntervalInMs > 0 {
time.Sleep(time.Millisecond * time.Duration(processor.config.DetectIntervalInMs))
}
+ // Let migration-style pipelines exit once queue discovery stays idle for a full
+ // interval instead of polling forever after all active queues drain.
+ if shouldQuitActiveQueueDetection(
+ lastDispatch,
+ time.Duration(processor.config.IdleTimeoutInSecond)*time.Second,
+ time.Duration(processor.config.DetectIntervalInMs)*time.Millisecond,
+ util.MapLength(&processor.inFlightQueueConfigs),
+ ) {
+ if processor.bulkStats != nil {
+ log.Debugf(
+ "active queue detector idle exit, success=%d, invalid=%d, failure=%d, inflight=%d",
+ processor.bulkStats.Summary.Success.Count,
+ processor.bulkStats.Summary.Invalid.Count,
+ processor.bulkStats.Summary.Failure.Count,
+ util.MapLength(&processor.inFlightQueueConfigs),
+ )
+ }
+ return
+ }
}
}(c)
}
} else {
cfgs := queue.GetConfigBySelector(&processor.config.Selector)
- log.Debugf("filter queue by:%v, num of queues:%v", processor.config.Selector.ToString(), len(cfgs))
+ if global.Env().IsDebug {
+ log.Tracef("filter queue by:%v, num of queues:%v", processor.config.Selector.ToString(), len(cfgs))
+ }
for _, v := range cfgs {
if global.Env().IsDebug {
log.Tracef("checking queue: %v", v)
@@ -334,14 +369,35 @@ func (processor *BulkIndexingProcessor) Process(c *pipeline.Context) error {
return nil
}
+// shouldQuitActiveQueueDetection reports whether the queue detection loop may stop.
+// It returns false when idleDuration is non-positive. Otherwise it returns true only when
+// inflight is zero and at least idleDuration+detectInterval has elapsed since lastDispatch.
+func shouldQuitActiveQueueDetection(lastDispatch time.Time, idleDuration time.Duration, detectInterval time.Duration, inflight int) bool {
+ if idleDuration <= 0 {
+ return false
+ }
+ // A negative interval must not shorten the idle window; treat it as zero.
+ if detectInterval < 0 {
+ detectInterval = 0
+ }
+ return inflight == 0 && time.Since(lastDispatch) >= idleDuration+detectInterval
+}
+
const queueHandleSingleton = "queue_handler_singleton"
func (processor *BulkIndexingProcessor) HandleQueueConfig(v *queue.QueueConfig, parentContext *pipeline.Context) {
+ // Prevent duplicate local workers for the same queue before competing for the
+ // distributed lease; this keeps one process from starting overlapping consumers.
+ if !processor.acquireQueueOwner(v.ID) {
+ if rate.GetRateLimiter("bulk_queue_owner", v.ID, 1, 1, 30*time.Second).Allow() {
+ log.Debugf("skip queue:[%v], already owned by another local bulk processor", v.ID)
+ }
+ return
+ }
+ defer processor.releaseQueueOwnerIfIdle(v.ID)
//TODO, add config to enable/disable singleton, may have performance issue
ok, _ := locker.Hold(queueHandleSingleton, v.ID, global.Env().SystemConfig.NodeConfig.ID, 60*time.Second, true)
if !ok {
- log.Debugf("failed to hold lock for queue:[%v], already hold by somewhere", v.ID)
+ if rate.GetRateLimiter("bulk_queue_lock", v.ID, 1, 1, 30*time.Second).Allow() {
+ log.Debugf("failed to hold lock for queue:[%v], already hold by somewhere", v.ID)
+ }
return
}
@@ -433,6 +489,10 @@ func (processor *BulkIndexingProcessor) HandleQueueConfig(v *queue.QueueConfig,
}
func (processor *BulkIndexingProcessor) NewBulkWorker(parentContext *pipeline.Context, qConfig *queue.QueueConfig, preferedHost string) {
+ if global.Env().IsDebug {
+ // include the current time in the message so worker start-up can be correlated in monitoring and logs
+ log.Debugf("starting bulk worker for queue: %v, host: %v at time: %v", qConfig.Name, preferedHost, time.Now().Format(time.RFC3339))
+ }
bulkSizeInByte := processor.config.BulkConfig.GetBulkSizeInBytes()
//check slice
for sliceID := 0; sliceID < processor.config.NumOfSlices; sliceID++ {
@@ -460,50 +520,112 @@ func (processor *BulkIndexingProcessor) NewBulkWorker(parentContext *pipeline.Co
return
}
- processor.Lock()
- v2, exists := processor.inFlightQueueConfigs.Load(key)
- if exists {
+ var workerID = util.GetUUID()
+ v2, reserved := processor.reserveInFlightQueue(key, workerID)
+ if !reserved {
if global.Env().IsDebug {
log.Tracef("[%v], queue [%v], slice_id:%v has more then one consumer, key:%v,v:%v", preferedHost, qConfig.ID, sliceID, key, v2)
}
- processor.Unlock()
continue
- } else {
- var workerID = util.GetUUID()
- log.Debugf("starting worker:[%v], queue:[%v], slice_id:%v, host:[%v]", workerID, qConfig.Name, sliceID, preferedHost)
-
- ctx1 := &pipeline.Context{}
- ctx1.Set("key", key)
- ctx1.Set("workerID", workerID)
- ctx1.Set("sliceID", sliceID)
- ctx1.Set("numOfSlices", processor.config.NumOfSlices)
- ctx1.Set("tag", preferedHost)
- ctx1.Set("qConfig", qConfig)
- ctx1.Set("host", preferedHost)
- ctx1.Set("bulkSizeInByte", bulkSizeInByte)
- err := processor.pool.Submit(&pipeline.Task{
- Handler: func(ctx *pipeline.Context, v ...interface{}) {
- key := ctx.MustGetString("key")
- workerID := ctx.MustGetString("workerID")
- host := ctx.MustGetString("host")
- sliceID := ctx.MustGetInt("sliceID")
- tag := ctx.MustGetString("tag")
- numOfSlices := ctx.MustGetInt("numOfSlices")
- bulkSizeInByte := ctx.MustGetInt("bulkSizeInByte")
- qConfig := ctx.MustGet("qConfig").(*queue.QueueConfig)
- pCtx := v[0].(*pipeline.Context)
- processor.NewSlicedBulkWorker(pCtx, key, workerID, sliceID, numOfSlices, tag, bulkSizeInByte, qConfig, host)
- },
- Context: ctx1,
- Params: []interface{}{parentContext}, // 也可以在创建任务时设置参数
- })
- processor.Unlock()
- if err != nil {
- panic(err)
- }
- processor.wg.Add(1)
}
+
+ log.Tracef("starting worker:[%v], queue:[%v], slice_id:%v, host:[%v]", workerID, qConfig.Name, sliceID, preferedHost)
+
+ ctx1 := &pipeline.Context{}
+ ctx1.Set("key", key)
+ ctx1.Set("workerID", workerID)
+ ctx1.Set("sliceID", sliceID)
+ ctx1.Set("numOfSlices", processor.config.NumOfSlices)
+ ctx1.Set("tag", preferedHost)
+ ctx1.Set("qConfig", qConfig)
+ ctx1.Set("host", preferedHost)
+ ctx1.Set("bulkSizeInByte", bulkSizeInByte)
+ err := processor.pool.Submit(&pipeline.Task{
+ Handler: func(ctx *pipeline.Context, v ...interface{}) {
+ key := ctx.MustGetString("key")
+ workerID := ctx.MustGetString("workerID")
+ host := ctx.MustGetString("host")
+ sliceID := ctx.MustGetInt("sliceID")
+ tag := ctx.MustGetString("tag")
+ numOfSlices := ctx.MustGetInt("numOfSlices")
+ bulkSizeInByte := ctx.MustGetInt("bulkSizeInByte")
+ qConfig := ctx.MustGet("qConfig").(*queue.QueueConfig)
+ pCtx := v[0].(*pipeline.Context)
+ processor.NewSlicedBulkWorker(pCtx, key, workerID, sliceID, numOfSlices, tag, bulkSizeInByte, qConfig, host)
+ },
+ Context: ctx1,
+ Params: []interface{}{parentContext}, // 也可以在创建任务时设置参数
+ })
+ if err != nil {
+ processor.inFlightQueueConfigs.Delete(key)
+ processor.wg.Done()
+ panic(err)
+ }
+ }
+}
+
+// reserveInFlightQueue claims key for workerID while holding the processor lock.
+// If key is already present it returns the stored value and false, changing nothing.
+// Otherwise it stores workerID under key, adds one to processor.wg and returns
+// (workerID, true); the wg.Add(1) here must be matched by a later wg.Done().
+func (processor *BulkIndexingProcessor) reserveInFlightQueue(key, workerID string) (interface{}, bool) {
+ processor.Lock()
+ defer processor.Unlock()
+
+ v, exists := processor.inFlightQueueConfigs.Load(key)
+ if exists {
+ return v, false
+ }
+
+ // Track workers by queue+slice so retries or queue re-discovery never start a second
+ // consumer for the same slice while the first one is still draining.
+ processor.inFlightQueueConfigs.Store(key, workerID)
+ processor.wg.Add(1)
+
+ return workerID, true
+}
+
+// hasInFlightQueue reports whether inFlightQueueConfigs contains an entry whose key is
+// queueID itself or a string starting with queueID followed by "-".
+// NOTE(review): the prefix test would also match keys of a different queue whose ID begins
+// with queueID+"-"; confirm queue IDs cannot overlap that way.
+func (processor *BulkIndexingProcessor) hasInFlightQueue(queueID string) bool {
+ // Exact-key match first.
+ if _, ok := processor.inFlightQueueConfigs.Load(queueID); ok {
+ return true
+ }
+
+ queuePrefix := fmt.Sprintf("%v-", queueID)
+ hasInFlight := false
+ processor.inFlightQueueConfigs.Range(func(key, value interface{}) bool {
+ keyStr, ok := key.(string)
+ if ok && strings.HasPrefix(keyStr, queuePrefix) {
+ hasInFlight = true
+ // Returning false stops Range at the first match.
+ return false
+ }
+ return true
+ })
+
+ return hasInFlight
+}
+
+// acquireQueueOwner records processor.id as the owner of queueID in queueOwners unless an
+// owner is already stored. It returns true when this processor stored the entry or was
+// already the stored owner, and false when a different owner is recorded.
+func (processor *BulkIndexingProcessor) acquireQueueOwner(queueID string) bool {
+ owner, loaded := queueOwners.LoadOrStore(queueID, processor.id)
+ if !loaded {
+ return true
+ }
+
+ return owner == processor.id
+}
+
+// releaseQueueOwnerIfIdle removes queueID from queueOwners when this processor has no
+// in-flight entry for the queue and the stored owner equals processor.id. In every other
+// case it leaves queueOwners unchanged.
+func (processor *BulkIndexingProcessor) releaseQueueOwnerIfIdle(queueID string) {
+ if processor.hasInFlightQueue(queueID) {
+ return
+ }
+
+ // NOTE(review): Load and Delete are two separate calls rather than one atomic step.
+ owner, ok := queueOwners.Load(queueID)
+ if ok && owner == processor.id {
+ queueOwners.Delete(queueID)
+ }
+}
+
+// isIgnorableAcquireConsumerError reports whether err is the "already owning this topic"
+// error, as matched on its message by util.ContainStr. A nil err yields false.
+func isIgnorableAcquireConsumerError(err error) bool {
+ if err == nil {
+ return false
}
+
+ return util.ContainStr(err.Error(), "already owning this topic")
}
var xxHashPool = sync.Pool{
@@ -549,8 +671,6 @@ func (processor *BulkIndexingProcessor) getConsumerConfig(queueID, consumerName
}
func (processor *BulkIndexingProcessor) NewSlicedBulkWorker(ctx *pipeline.Context, key, workerID string, sliceID, maxSlices int, tag string, bulkSizeInByte int, qConfig *queue.QueueConfig, host string) {
- processor.inFlightQueueConfigs.Store(key, workerID)
-
defer func() {
if !global.Env().IsDebug {
if r := recover(); r != nil {
@@ -571,6 +691,7 @@ func (processor *BulkIndexingProcessor) NewSlicedBulkWorker(ctx *pipeline.Contex
}
}
processor.inFlightQueueConfigs.Delete(key)
+ processor.releaseQueueOwnerIfIdle(qConfig.ID)
processor.wg.Done()
if global.Env().IsDebug {
log.Tracef("exit slice worker, worker:[%v], queue:%v, slice_id:%v, key:%v", workerID, qConfig.ID, sliceID, key)
@@ -600,12 +721,15 @@ func (processor *BulkIndexingProcessor) NewSlicedBulkWorker(ctx *pipeline.Contex
var consumerInstance queue.ConsumerAPI
consumerInstance, err = queue.AcquireConsumer(qConfig, consumerConfig, workerID)
if err != nil || consumerInstance == nil {
- if util.ContainStr(err.Error(), "already owning this topic") {
+ if isIgnorableAcquireConsumerError(err) {
if global.Env().IsDebug {
- log.Warnf("other consumer already owning this topic, queue:%v-%v, slice_id:%v", qConfig.Name, qConfig.ID, sliceID)
+ log.Warnf("skip duplicate consumer acquisition, queue:%v-%v, slice_id:%v, err:%v", qConfig.Name, qConfig.ID, sliceID, err)
}
return
}
+ if err == nil {
+ err = errors.New("failed to acquire queue consumer")
+ }
panic(err)
}
@@ -681,7 +805,9 @@ func (processor *BulkIndexingProcessor) NewSlicedBulkWorker(ctx *pipeline.Contex
log.Errorf("should not submit this bulk request, worker[%v], queue:[%v], slice:[%v], offset:[%v]->[%v],%v, msg:%v", workerID, qConfig.ID, sliceID, committedOffset, offset, err, mainBuf.GetMessageCount())
}
}
- log.Debugf("exit worker[%v], message count[%d], queue:[%v], slice_id:%v", workerID, mainBuf.GetMessageCount(), qConfig.ID, sliceID)
+ if global.Env().IsDebug {
+ log.Tracef("exit worker[%v], message count[%d], queue:[%v], slice_id:%v", workerID, mainBuf.GetMessageCount(), qConfig.ID, sliceID)
+ }
}()
if global.Env().IsDebug {
@@ -798,7 +924,13 @@ READ_DOCS:
consumerConfig.KeepActive()
messages, timeout, err := consumerInstance.FetchMessages(ctx1, consumerConfig.FetchMaxMessages)
stats.IncrementBy("queue", qConfig.ID+".msg_fetched_from_queue", int64(len(messages)))
- log.Debugf("slice worker, worker:[%v], [%v][%v][%v][%v] fetched message:%v,ctx:%v,timeout:%v,err:%v", workerID, qConfig.Name, consumerConfig.Group, consumerConfig.Name, sliceID, len(messages), ctx1.String(), timeout, err)
+ if err != nil || len(messages) > 0 {
+ if qConfig.Name == "bulk_requests" {
+ log.Tracef("slice worker, worker:[%v], [%v][%v][%v][%v] fetched message:%v,ctx:%v,timeout:%v,err:%v", workerID, qConfig.Name, consumerConfig.Group, consumerConfig.Name, sliceID, len(messages), ctx1.String(), timeout, err)
+ } else {
+ log.Debugf("slice worker, worker:[%v], [%v][%v][%v][%v] fetched message:%v,ctx:%v,timeout:%v,err:%v", workerID, qConfig.Name, consumerConfig.Group, consumerConfig.Name, sliceID, len(messages), ctx1.String(), timeout, err)
+ }
+ }
if err != nil {
if strings.Contains(err.Error(), "dirty_read") || err.Error() == "EOF" || err.Error() == "unexpected EOF" {
ctx.CancelTask()
@@ -905,6 +1037,10 @@ READ_DOCS:
mainBuf.WriteByteBuffer(pop.Data)
}
+ // Keep the in-memory offset aligned with the data already buffered.
+ // If the current message triggers an immediate flush, its NextOffset must be committed too.
+ offset = advanceBufferedOffset(pop.NextOffset)
+
if global.Env().IsDebug {
log.Tracef("slice worker, worker:[%v], message count: %v, size: %v", workerID, mainBuf.GetMessageCount(), util.ByteSize(uint64(mainBuf.GetMessageSize())))
}
@@ -949,7 +1085,7 @@ READ_DOCS:
if offset != nil && committedOffset != nil && !offset.Equals(*committedOffset) {
err := consumerInstance.CommitOffset(*offset)
if err != nil {
- log.Errorf("🔧 offset commit failed, worker:[%v], queue:[%v], slice:[%v], offset:[%v], err:%v", workerID, qConfig.Name, sliceID, *offset, err)
+ log.Errorf("offset commit failed, worker:[%v], queue:[%v], slice:[%v], offset:[%v], err:%v", workerID, qConfig.Name, sliceID, *offset, err)
panic(err)
}
@@ -958,27 +1094,18 @@ READ_DOCS:
}
// fix: update committedOffset immediately after successful commit, to ensure state consistency
committedOffset = offset
- log.Debugf("🔧 offset committed successfully, worker:[%v], queue:[%v], slice:[%v], offset:[%v]", workerID, qConfig.Name, sliceID, *offset)
- } else {
if global.Env().IsDebug {
- log.Debugf("🔧 offset not changed, skip commit, worker:[%v], queue:[%v], slice:[%v], offset:[%v], committed:[%v]", workerID, qConfig.Name, sliceID, offset, committedOffset)
+ log.Tracef("offset committed, worker:[%v], queue:[%v], slice:[%v], offset:[%v]", workerID, qConfig.Name, sliceID, *offset)
}
+ } else {
+ // skip unchanged offset silently to avoid noisy debug logs
}
- // fix: this code is moved to loop outside (line 970) to avoid updating offset in the middle of bulk submission
- // offset = &pop.NextOffset
}
} else {
log.Errorf("should not submit this bulk request, worker[%v], queue:[%v], slice:[%v], offset:[%v]->[%v],%v, msg:%v", workerID, qConfig.ID, sliceID, committedOffset, offset, err, msgCount)
}
}
-
- // fix: update offset after each message is processed, to ensure progress sync with actual processing
- // so even if it crashes before submission, it will not repeat processing messages written to the buffer after restart
- offset = &pop.NextOffset
}
-
- // fix: remove this code to avoid overwriting the updated offset in the loop
- // offset = &ctx1.NextOffset
}
if time.Since(lastCommit) > idleDuration && mainBuf.GetMessageSize() > 0 {
@@ -1002,7 +1129,7 @@ CLEAN_BUFFER:
}
if global.Env().IsDebug {
- log.Debugf("cleanup buffer, queue:[%v], slice_id:%v, offset [%v]-[%v], bulk failed (host: %v, err: %v)", qConfig.ID, sliceID, committedOffset, offset, host, err)
+ log.Tracef("cleanup buffer, queue:[%v], slice_id:%v, offset [%v]-[%v], bulk failed (host: %v, err: %v)", qConfig.ID, sliceID, committedOffset, offset, host, err)
}
lastCommit = time.Now()
// check bulk result, if ok, then commit offset, or retry non-200 requests, or save failure offset
@@ -1167,6 +1294,11 @@ func appendStrArr(arr []string, size int, elems []string) []string {
return append(arr, elems...)
}
+// advanceBufferedOffset returns a pointer to a fresh copy of nextOffset, so the result
+// does not alias any variable held by the caller.
+func advanceBufferedOffset(nextOffset queue.Offset) *queue.Offset {
+ next := nextOffset
+ return &next
+}
+
func (processor *BulkIndexingProcessor) getElasticsearchMetadata(qConfig *queue.QueueConfig) (string, *elastic.ElasticsearchMetadata) {
elasticsearch, ok := qConfig.Labels["elasticsearch"]
diff --git a/plugins/elastic/bulk_indexing/bulk_indexing_test.go b/plugins/elastic/bulk_indexing/bulk_indexing_test.go
index e37b0ffe0..f462d8f34 100644
--- a/plugins/elastic/bulk_indexing/bulk_indexing_test.go
+++ b/plugins/elastic/bulk_indexing/bulk_indexing_test.go
@@ -28,9 +28,13 @@
package bulk_indexing
import (
+ stdErrors "errors"
"github.com/OneOfOne/xxhash"
"github.com/stretchr/testify/assert"
+ "infini.sh/framework/core/queue"
+ "sync"
"testing"
+ "time"
)
func TestXXHash(t *testing.T) {
@@ -84,3 +88,90 @@ func TestXXHash(t *testing.T) {
}
}
+
+// TestReserveInFlightQueue checks that the first reservation of a key succeeds and is
+// stored, and that a second reservation of the same key is refused and reports the
+// original worker.
+func TestReserveInFlightQueue(t *testing.T) {
+ processor := &BulkIndexingProcessor{}
+
+ current, reserved := processor.reserveInFlightQueue("queue-0", "worker-1")
+ assert.True(t, reserved)
+ assert.Equal(t, "worker-1", current)
+
+ stored, exists := processor.inFlightQueueConfigs.Load("queue-0")
+ assert.True(t, exists)
+ assert.Equal(t, "worker-1", stored)
+
+ current, reserved = processor.reserveInFlightQueue("queue-0", "worker-2")
+ assert.False(t, reserved)
+ assert.Equal(t, "worker-1", current)
+
+ // Undo the successful reservation: drop the entry and balance its wg.Add(1).
+ processor.inFlightQueueConfigs.Delete("queue-0")
+ processor.wg.Done()
+}
+
+// TestHasInFlightQueue checks that a "queue-0-0" entry makes "queue-0" count as in flight,
+// and that the queue is idle before the entry is stored and after it is deleted.
+func TestHasInFlightQueue(t *testing.T) {
+ processor := &BulkIndexingProcessor{}
+
+ assert.False(t, processor.hasInFlightQueue("queue-0"))
+
+ processor.inFlightQueueConfigs.Store("queue-0-0", "worker-1")
+ assert.True(t, processor.hasInFlightQueue("queue-0"))
+
+ processor.inFlightQueueConfigs.Delete("queue-0-0")
+ assert.False(t, processor.hasInFlightQueue("queue-0"))
+}
+
+// TestAcquireQueueOwner checks that the first processor to claim a queue owns it, that
+// re-acquiring by the same processor succeeds, and that a second processor is refused.
+// The package-level queueOwners map is reset before and after the test.
+func TestAcquireQueueOwner(t *testing.T) {
+ queueOwners = sync.Map{}
+
+ processor1 := &BulkIndexingProcessor{id: "processor-1"}
+ processor2 := &BulkIndexingProcessor{id: "processor-2"}
+
+ assert.True(t, processor1.acquireQueueOwner("queue-0"))
+ assert.True(t, processor1.acquireQueueOwner("queue-0"))
+ assert.False(t, processor2.acquireQueueOwner("queue-0"))
+
+ queueOwners = sync.Map{}
+}
+
+// TestReleaseQueueOwnerIfIdle checks that ownership is kept while the queue still has an
+// in-flight entry and is removed once that entry is gone.
+func TestReleaseQueueOwnerIfIdle(t *testing.T) {
+ queueOwners = sync.Map{}
+
+ processor := &BulkIndexingProcessor{id: "processor-1"}
+ assert.True(t, processor.acquireQueueOwner("queue-0"))
+
+ // With an in-flight slice entry the release must be a no-op.
+ processor.inFlightQueueConfigs.Store("queue-0-0", "worker-1")
+ processor.releaseQueueOwnerIfIdle("queue-0")
+ _, exists := queueOwners.Load("queue-0")
+ assert.True(t, exists)
+
+ // Once the entry is deleted the ownership record is removed.
+ processor.inFlightQueueConfigs.Delete("queue-0-0")
+ processor.releaseQueueOwnerIfIdle("queue-0")
+ _, exists = queueOwners.Load("queue-0")
+ assert.False(t, exists)
+}
+
+// TestIsIgnorableAcquireConsumerError checks that only the "already owning this topic"
+// message is ignorable; other messages and a nil error are not.
+func TestIsIgnorableAcquireConsumerError(t *testing.T) {
+ assert.True(t, isIgnorableAcquireConsumerError(stdErrors.New("already owning this topic")))
+ assert.False(t, isIgnorableAcquireConsumerError(stdErrors.New("the consumer is in fighting list")))
+ assert.False(t, isIgnorableAcquireConsumerError(stdErrors.New("some other error")))
+ assert.False(t, isIgnorableAcquireConsumerError(nil))
+}
+
+// TestShouldQuitActiveQueueDetection exercises the idle-exit rule: quit only when nothing
+// is in flight and idleDuration+detectInterval has elapsed since the last dispatch.
+func TestShouldQuitActiveQueueDetection(t *testing.T) {
+ // Dispatched just now: the idle window has not elapsed.
+ assert.False(t, shouldQuitActiveQueueDetection(time.Now(), 5*time.Second, 5*time.Second, 0))
+ // Window elapsed, but one worker is still in flight.
+ assert.False(t, shouldQuitActiveQueueDetection(time.Now().Add(-10*time.Second), 5*time.Second, 5*time.Second, 1))
+ // 9s is still short of the 5s+5s window.
+ assert.False(t, shouldQuitActiveQueueDetection(time.Now().Add(-9*time.Second), 5*time.Second, 5*time.Second, 0))
+ // The full 10s window has elapsed with nothing in flight.
+ assert.True(t, shouldQuitActiveQueueDetection(time.Now().Add(-10*time.Second), 5*time.Second, 5*time.Second, 0))
+ // With a zero detect interval the window is idleDuration alone.
+ assert.True(t, shouldQuitActiveQueueDetection(time.Now().Add(-5*time.Second), 5*time.Second, 0, 0))
+}
+
+// TestAdvanceBufferedOffsetUsesCurrentMessageNextOffset checks that the returned offset is
+// non-nil, equals the offset passed in, and differs from an earlier committed offset.
+func TestAdvanceBufferedOffsetUsesCurrentMessageNextOffset(t *testing.T) {
+ previousCommitted := queue.NewOffsetWithVersion(0, 100, 1)
+ currentNext := queue.NewOffsetWithVersion(0, 200, 1)
+
+ offset := advanceBufferedOffset(currentNext)
+
+ assert.NotNil(t, offset)
+ assert.True(t, offset.Equals(currentNext))
+ assert.False(t, offset.Equals(previousCommitted))
+}