Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/agent/loop.go
Original file line number Diff line number Diff line change
Expand Up @@ -1675,7 +1675,7 @@ func (al *AgentLoop) runTurn(ctx context.Context, ts *turnState) (turnResult, er
if !ts.opts.NoHistory && (strings.TrimSpace(ts.userMessage) != "" || len(ts.media) > 0) {
rootMsg := providers.Message{
Role: "user",
Content: ts.userMessage,
Content: resolvedCurrentUserMessageContent(messages, ts.userMessage),
Media: append([]string(nil), ts.media...),
}
if len(rootMsg.Media) > 0 {
Expand Down
20 changes: 18 additions & 2 deletions pkg/agent/loop_media.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ import (
)

// resolveMediaRefs resolves media:// refs in messages.
// Images are base64-encoded into the Media array for multimodal LLMs.
// Images are base64-encoded into the Media array for multimodal LLMs and also
// have their local path injected into Content so tools can read the file.
// Non-image files (documents, audio, video) have their local path injected
// into Content so the agent can access them via file tools like read_file.
// Returns a new slice; original messages are not mutated.
Expand Down Expand Up @@ -68,6 +69,7 @@ func resolveMediaRefs(messages []providers.Message, store media.MediaStore, maxS
mime := detectMIME(localPath, meta)

if strings.HasPrefix(mime, "image/") {
pathTags = append(pathTags, buildPathTag(mime, localPath))
dataURL := encodeImageToDataURL(localPath, mime, info, maxSize)
if dataURL != "" {
resolved = append(resolved, dataURL)
Expand Down Expand Up @@ -105,6 +107,15 @@ func buildArtifactTags(store media.MediaStore, refs []string) []string {
return tags
}

// resolvedCurrentUserMessageContent returns the Content of the last message in
// messages whose Role is "user". When messages holds no user message (including
// when it is empty or nil), fallback is returned unchanged.
func resolvedCurrentUserMessageContent(messages []providers.Message, fallback string) string {
	// Walk backwards until we land on a user message or run off the front.
	idx := len(messages) - 1
	for idx >= 0 && messages[idx].Role != "user" {
		idx--
	}
	if idx < 0 {
		return fallback
	}
	return messages[idx].Content
}

// detectMIME determines the MIME type from metadata or magic-bytes detection.
// Returns empty string if detection fails.
func detectMIME(localPath string, meta media.MediaMeta) string {
Expand Down Expand Up @@ -160,9 +171,12 @@ func encodeImageToDataURL(localPath, mime string, info os.FileInfo, maxSize int)
}

// buildPathTag creates a structured tag exposing the local file path.
// Tag type is derived from MIME: [audio:/path], [video:/path], or [file:/path].
// Tag type is derived from MIME: [image:/path], [audio:/path],
// [video:/path], or [file:/path].
func buildPathTag(mime, localPath string) string {
switch {
case strings.HasPrefix(mime, "image/"):
return "[image:" + localPath + "]"
case strings.HasPrefix(mime, "audio/"):
return "[audio:" + localPath + "]"
case strings.HasPrefix(mime, "video/"):
Expand All @@ -178,6 +192,8 @@ func injectPathTags(content string, tags []string) string {
for _, tag := range tags {
var generic string
switch {
case strings.HasPrefix(tag, "[image:"):
generic = "[image: photo]"
case strings.HasPrefix(tag, "[audio:"):
generic = "[audio]"
case strings.HasPrefix(tag, "[video:"):
Expand Down
92 changes: 92 additions & 0 deletions pkg/agent/loop_media_path_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package agent

import (
"os"
"path/filepath"
"strings"
"testing"

"github.com/sipeed/picoclaw/pkg/media"
"github.com/sipeed/picoclaw/pkg/providers"
)

// TestResolveMediaRefsImageAddsDataURLAndPathTag stores a file with an
// image/png content type and verifies that resolveMediaRefs yields a single
// message whose Media entry is a PNG data URL and whose Content carries the
// [image:<path>] tag in place of the generic "[image: photo]" placeholder.
func TestResolveMediaRefsImageAddsDataURLAndPathTag(t *testing.T) {
	tmpDir := t.TempDir()
	imagePath := filepath.Join(tmpDir, "photo.png")
	// The bytes are not a valid PNG; the MIME type comes from the stored metadata.
	payload := []byte("not a real png, but metadata supplies MIME")
	if writeErr := os.WriteFile(imagePath, payload, 0o600); writeErr != nil {
		t.Fatalf("WriteFile() error = %v", writeErr)
	}

	mediaStore := media.NewFileMediaStore()
	mediaRef, storeErr := mediaStore.Store(imagePath, media.MediaMeta{ContentType: "image/png"}, "scope")
	if storeErr != nil {
		t.Fatalf("Store() error = %v", storeErr)
	}

	input := []providers.Message{{
		Role:    "user",
		Content: "[image: photo]",
		Media:   []string{mediaRef},
	}}
	resolved := resolveMediaRefs(input, mediaStore, 1024*1024)

	if len(resolved) != 1 {
		t.Fatalf("result len = %d, want 1", len(resolved))
	}

	gotMedia := resolved[0].Media
	if len(gotMedia) != 1 || !strings.HasPrefix(gotMedia[0], "data:image/png;base64,") {
		t.Fatalf("resolved media = %#v, want data:image/png", gotMedia)
	}

	gotContent := resolved[0].Content
	pathTag := "[image:" + imagePath + "]"
	if !strings.Contains(gotContent, pathTag) {
		t.Fatalf("content = %q, want path tag %q", gotContent, pathTag)
	}
	if strings.Contains(gotContent, "[image: photo]") {
		t.Fatalf("content still contains generic image tag: %q", gotContent)
	}
}

// TestResolvedCurrentUserMessageContentReturnsResolvedLastUser verifies that,
// given a conversation with two user messages, the helper returns the content
// of the later one rather than the earlier one or the fallback.
func TestResolvedCurrentUserMessageContentReturnsResolvedLastUser(t *testing.T) {
	const expected = "new [image:/tmp/picoclaw_media/a.jpg]"

	conversation := []providers.Message{
		{Role: "system", Content: "system"},
		{Role: "user", Content: "old"},
		{Role: "assistant", Content: "answer"},
		{Role: "user", Content: "new [image:/tmp/picoclaw_media/a.jpg]"},
	}

	if actual := resolvedCurrentUserMessageContent(conversation, "fallback"); actual != expected {
		t.Fatalf("resolvedCurrentUserMessageContent() = %q, want %q", actual, expected)
	}
}

// TestResolvedImageMessageContentCanRemainInHistoryForFollowupText resolves an
// image message, takes the resulting user content via
// resolvedCurrentUserMessageContent, and checks that the [image:<path>] tag is
// present both in that content and in a follow-up history slice built from it.
func TestResolvedImageMessageContentCanRemainInHistoryForFollowupText(t *testing.T) {
	tmpDir := t.TempDir()
	imagePath := filepath.Join(tmpDir, "photo.jpg")
	if writeErr := os.WriteFile(imagePath, []byte("jpeg bytes supplied by Feishu"), 0o600); writeErr != nil {
		t.Fatalf("WriteFile() error = %v", writeErr)
	}

	mediaStore := media.NewFileMediaStore()
	mediaRef, storeErr := mediaStore.Store(imagePath, media.MediaMeta{ContentType: "image/jpeg"}, "scope")
	if storeErr != nil {
		t.Fatalf("Store() error = %v", storeErr)
	}

	imageTurn := []providers.Message{{
		Role:    "user",
		Content: "[image: photo]",
		Media:   []string{mediaRef},
	}}
	resolvedTurn := resolveMediaRefs(imageTurn, mediaStore, 1024*1024)
	persisted := resolvedCurrentUserMessageContent(resolvedTurn, "[image: photo]")

	pathTag := "[image:" + imagePath + "]"
	if !strings.Contains(persisted, pathTag) {
		t.Fatalf("history content = %q, want path tag %q", persisted, pathTag)
	}

	// Model a later text-only turn that follows the stored image message.
	history := []providers.Message{
		{Role: "user", Content: persisted},
		{Role: "user", Content: "\u628a\u8fd9\u4e2a\u56fe\u7247\u8bc4\u8bae\u5230 issue"},
	}
	if !strings.Contains(history[0].Content, pathTag) {
		t.Fatalf("follow-up history lost path tag: %#v", history)
	}
}
59 changes: 42 additions & 17 deletions pkg/agent/loop_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -963,6 +963,45 @@ type artifactThenSendProvider struct {
calls int
}

// extractToolResultArtifactPath pulls the path out of the first usable
// structured tag found in content. Tag kinds are tried in the fixed order
// [image:, [audio:, [video:, [file: — the kind order wins over position in the
// string. For each kind only its first occurrence is inspected; it is skipped
// when it has no closing "]" or when the text between the prefix and "]" is
// empty or whitespace-only. The path is returned exactly as written (not
// trimmed). Returns "" when no kind yields a path.
func extractToolResultArtifactPath(content string) string {
	for _, marker := range [...]string{"[image:", "[audio:", "[video:", "[file:"} {
		_, tail, found := strings.Cut(content, marker)
		if !found {
			continue
		}

		candidate, _, closed := strings.Cut(tail, "]")
		if !closed || strings.TrimSpace(candidate) == "" {
			continue
		}
		return candidate
	}

	return ""
}

// getArtifactPath scans messages from newest to oldest and returns the first
// non-empty artifact path that extractToolResultArtifactPath finds in a message
// whose Role is "tool". Messages with any other role are ignored. Returns ""
// when no tool message yields a path.
func (m *artifactThenSendProvider) getArtifactPath(messages []providers.Message) string {
	for idx := len(messages) - 1; idx >= 0; idx-- {
		msg := &messages[idx]
		if msg.Role == "tool" {
			if found := extractToolResultArtifactPath(msg.Content); found != "" {
				return found
			}
		}
	}

	return ""
}

func (m *artifactThenSendProvider) Chat(
ctx context.Context,
messages []providers.Message,
Expand All @@ -983,23 +1022,7 @@ func (m *artifactThenSendProvider) Chat(
}, nil
}

var artifactPath string
for i := len(messages) - 1; i >= 0; i-- {
if messages[i].Role != "tool" {
continue
}
start := strings.Index(messages[i].Content, "[file:")
if start < 0 {
continue
}
rest := messages[i].Content[start+len("[file:"):]
end := strings.Index(rest, "]")
if end < 0 {
continue
}
artifactPath = rest[:end]
break
}
artifactPath := m.getArtifactPath(messages)
if artifactPath == "" {
return nil, fmt.Errorf("provider did not receive artifact path in tool result")
}
Expand Down Expand Up @@ -2733,6 +2756,8 @@ func TestResolveMediaRefs_MixedImageAndFile(t *testing.T) {
t.Fatal("expected image to be base64 encoded")
}
expectedContent := "check these [file:" + pdfPath + "]"
expectedImageTag := "[image:" + pngPath + "]"
expectedContent += " " + expectedImageTag
if result[0].Content != expectedContent {
t.Fatalf("expected content %q, got %q", expectedContent, result[0].Content)
}
Expand Down
11 changes: 9 additions & 2 deletions pkg/agent/steering_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1087,10 +1087,17 @@ func TestAgentLoop_Continue_PreservesSteeringMedia(t *testing.T) {

foundResolvedMedia := false
for _, msg := range msgs {
if msg.Role != "user" || msg.Content != "describe this image" || len(msg.Media) != 1 {
if msg.Role != "user" || len(msg.Media) != 1 {
continue
}
if strings.HasPrefix(msg.Media[0], "data:image/png;base64,") {
if strings.HasPrefix(msg.Content, "describe this image") &&
strings.HasPrefix(msg.Media[0], "data:image/png;base64,") {
foundResolvedMedia = true
break
}

wantTag := "[image:" + pngPath + "]"
if strings.Contains(msg.Content, wantTag) {
foundResolvedMedia = true
break
}
Expand Down
72 changes: 72 additions & 0 deletions pkg/channels/feishu/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,78 @@ func extractFileKey(content string) string { return extractJSONStringField(conte
// extractFileName extracts the file_name from a Feishu file message content JSON.
func extractFileName(content string) string { return extractJSONStringField(content, "file_name") }

// extractPostImageKeys collects, in document order, the non-empty string
// image_key of every element tagged "img" in a Feishu rich text post.
// Expected shape: {"title":"","content":[[{"tag":"img","image_key":"img_xxx"}]]}
// Elements whose tag is not the string "img", or whose image_key is missing,
// empty, or not a string, are skipped. Returns nil when content cannot be
// decoded or contains no matching element.
func extractPostImageKeys(content string) []string {
	var post struct {
		Content [][]map[string]any `json:"content"`
	}
	if json.Unmarshal([]byte(content), &post) != nil {
		return nil
	}

	var imageKeys []string
	for _, row := range post.Content {
		for _, node := range row {
			tag, _ := node["tag"].(string)
			imageKey, _ := node["image_key"].(string)
			if tag == "img" && imageKey != "" {
				imageKeys = append(imageKeys, imageKey)
			}
		}
	}
	return imageKeys
}

// extractPostText renders a Feishu rich text post as plain text.
// The trimmed title, when non-empty, becomes the first line. Each content row
// then contributes one line built from its elements: "text" and "a" elements
// append their text field, and "at" elements append "@" followed by user_name
// (or the text field when user_name is empty). Every other tag, including
// images, is ignored here. Rows that are blank after trimming are dropped, the
// remaining lines are joined with "\n", and the result is trimmed.
// Returns "" when content cannot be decoded.
func extractPostText(content string) string {
	var post struct {
		Title   string             `json:"title"`
		Content [][]map[string]any `json:"content"`
	}
	if json.Unmarshal([]byte(content), &post) != nil {
		return ""
	}

	var paragraphs []string
	if heading := strings.TrimSpace(post.Title); heading != "" {
		paragraphs = append(paragraphs, heading)
	}

	for _, row := range post.Content {
		var sb strings.Builder
		for _, node := range row {
			kind, _ := node["tag"].(string)
			if kind == "text" || kind == "a" {
				sb.WriteString(postStringField(node, "text"))
				continue
			}
			if kind != "at" {
				continue
			}

			// Prefer the mention's user_name; fall back to its text field.
			mention := postStringField(node, "user_name")
			if mention == "" {
				mention = postStringField(node, "text")
			}
			if mention != "" {
				sb.WriteString("@" + mention)
			}
		}

		if rendered := strings.TrimSpace(sb.String()); rendered != "" {
			paragraphs = append(paragraphs, rendered)
		}
	}

	return strings.TrimSpace(strings.Join(paragraphs, "\n"))
}

// postStringField returns m[key] when that value is a string. It returns ""
// when m is nil, the key is absent, or the value has any other type.
func postStringField(m map[string]any, key string) string {
	// Indexing a nil map is safe in Go and yields the zero value, so no
	// separate nil guard is needed before the type assertion.
	if s, ok := m[key].(string); ok {
		return s
	}
	return ""
}

// stripMentionPlaceholders removes @_user_N placeholders from the text content.
// These are inserted by Feishu when users @mention someone in a message.
func stripMentionPlaceholders(content string, mentions []*larkim.MentionEvent) string {
Expand Down
Loading
Loading