From b1e1adac10aeaffc05edd604bf9c93b14e14b78f Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Fri, 26 Jun 2026 08:10:04 -0500 Subject: [PATCH 1/5] fix(parser): classify Codex goal contexts as system Codex goal context records should be treated as archived system context instead of ordinary transcript content, otherwise legacy metadata can leak into transcript rendering and export/search surfaces. Keeping the parser classification explicit gives downstream filtering a stable boundary for hiding those rows without losing the archived messages. - fix(transcript): hide legacy Codex goal context rows - merge: sync with origin/main --- frontend/src/lib/utils/messages.test.ts | 5 ++ frontend/src/lib/utils/messages.ts | 2 + internal/db/db.go | 5 +- internal/db/db_test.go | 7 ++- internal/db/search.go | 2 + internal/db/search_test.go | 2 + internal/parser/codex.go | 22 +++++--- internal/parser/codex_parser_test.go | 63 +++++++++++++++++++++ internal/server/export.go | 18 +++++- internal/server/export_test.go | 73 +++++++++++++++++++++++++ 10 files changed, 188 insertions(+), 11 deletions(-) diff --git a/frontend/src/lib/utils/messages.test.ts b/frontend/src/lib/utils/messages.test.ts index 032ae5c39..ee4e4d5d6 100644 --- a/frontend/src/lib/utils/messages.test.ts +++ b/frontend/src/lib/utils/messages.test.ts @@ -58,6 +58,11 @@ describe("isSystemMessage", () => { ["command-name", "/commit"], ["local-command", "ok"], ["stop hook", "Stop hook feedback: blocked"], + ["legacy goal context", "\n\tstate"], + [ + "codex internal goal context", + ' state', + ], ])("detects prefix-based system message: %s", (_label, content) => { expect(isSystemMessage(msg({ content }))).toBe(true); }); diff --git a/frontend/src/lib/utils/messages.ts b/frontend/src/lib/utils/messages.ts index 26044ab9f..09aef855e 100644 --- a/frontend/src/lib/utils/messages.ts +++ b/frontend/src/lib/utils/messages.ts @@ -8,6 +8,8 @@ const SYSTEM_MSG_PREFIXES = [ "", "", + '', ]; // Subtypes the Claude parser promotes into visible system messages diff --git a/internal/db/db.go b/internal/db/db.go index 8c7c2b9dd..a6f937c65 100644 --- a/internal/db/db.go +++ b/internal/db/db.go @@ -245,6 +245,9 @@ import ( // backfill. Re-parsing persists estimated usage events for existing // aggregate-only Kimi sessions and preserves explicit native event // model names instead of the proxy fallback.) +// (56: Codex goal-continuation context wrappers are filtered from +// persisted messages and user_message_count. Existing Codex rows need +// re-parsing so synthetic /goal continuation records are removed.) // (54: Antigravity .db sessions record a schema-fingerprint // source_version. Re-parsing populates source_version on existing // Antigravity IDE and CLI rows so "which agy release produced this @@ -258,7 +261,7 @@ import ( // (51: Gemini cumulative-to-delta token reparse.) // (17: Codex template filtering.) // (16: system messages.) -const dataVersion = 55 +const dataVersion = 56 const tokenCoverageRepairStatsKey = "token_coverage_repair_v1" diff --git a/internal/db/db_test.go b/internal/db/db_test.go index a92393e2b..96c289b13 100644 --- a/internal/db/db_test.go +++ b/internal/db/db_test.go @@ -696,10 +696,15 @@ func TestMigration_ToolResultEventsTable(t *testing.T) { } func TestCurrentDataVersionKimiUsageEvents(t *testing.T) { - assert.Equal(t, 55, CurrentDataVersion(), + assert.GreaterOrEqual(t, CurrentDataVersion(), 55, "Kimi persisted usage events require a data version bump") } +func TestCurrentDataVersionGoalContextFiltering(t *testing.T) { + assert.Equal(t, 56, CurrentDataVersion(), + "Codex goal-context filtering requires a data version bump") +} + func TestInsertMessages_PreservesToolResultEvents(t *testing.T) { d := testDB(t) insertSession(t, d, "s-events", "proj") diff --git a/internal/db/search.go b/internal/db/search.go index 88c9ce1bf..8b6803519 100644 --- a/internal/db/search.go +++ b/internal/db/search.go @@ -25,6 +25,8 @@ var SystemMsgPrefixes = []string{ "", "", + ``, } // SystemPrefixSQL returns a SQL clause that excludes user messages diff --git a/internal/db/search_test.go b/internal/db/search_test.go index a0b12b9a9..2a1f40ce8 100644 --- a/internal/db/search_test.go +++ b/internal/db/search_test.go @@ -22,6 +22,8 @@ func TestIsSystemPrefixed(t *testing.T) { {"task-notification prefix", "done", "user", true}, {"leading whitespace then prefix", "\n\t /foo", "user", true}, {"bom then prefix", "\uFEFFx", "user", true}, + {"legacy goal context prefix", "\n\tstate", "user", true}, + {"codex internal goal context prefix", `state`, "user", true}, {"assistant role is never system-prefixed", SystemMsgPrefixes[0], "assistant", false}, {"prefix mid-content does not match", "see later", "user", false}, } diff --git a/internal/parser/codex.go b/internal/parser/codex.go index f4b872269..d39a53bc8 100644 --- a/internal/parser/codex.go +++ b/internal/parser/codex.go @@ -1739,15 +1739,23 @@ func IsIncrementalFullParseFallback(err error) bool { errors.Is(err, ErrClaudeIncrementalNeedsFullParse) } +var codexSystemMessagePrefixes = []string{ + "# AGENTS.md", + "", + "", + "", +} + func isCodexSystemMessage(content string) bool { trimmed := strings.TrimSpace(content) - return strings.HasPrefix(content, "# AGENTS.md") || - strings.HasPrefix(content, "") || - strings.HasPrefix(content, "") || - isCodexTurnAbortedMessage(content) || - strings.HasPrefix(trimmed, "") || - isCodexSubagentNotification(content) || - isCodexGoalContext(content) + for _, prefix := range codexSystemMessagePrefixes { + if strings.HasPrefix(trimmed, prefix) { + return true + } + } + return isCodexTurnAbortedMessage(trimmed) || + isCodexSubagentNotification(trimmed) || + isCodexGoalContext(trimmed) } // isCodexGoalContext reports whether content is a Codex /goal diff --git a/internal/parser/codex_parser_test.go b/internal/parser/codex_parser_test.go index 8cb3da38c..886fd6e3c 100644 --- a/internal/parser/codex_parser_test.go +++ b/internal/parser/codex_parser_test.go @@ -1398,6 +1398,34 @@ func TestParseCodexSession_EdgeCases(t *testing.T) { assert.Equal(t, "Actual user message", msgs[0].Content) }) + t.Run("skips goal continuation context wrappers", func(t *testing.T) { + currentGoal := "\n" + + "Continue working toward the active thread goal.\n" + + "" + legacyGoal := "\n" + + "Continue working toward the active thread goal.\n" + + "" + content := testjsonl.JoinJSONL( + testjsonl.CodexSessionMetaJSON("abc", "/tmp", "user", tsEarly), + testjsonl.CodexMsgJSON("user", "\n\t"+currentGoal, tsEarlyS1), + testjsonl.CodexMsgJSON("user", "Actual user message", "2024-01-01T10:00:02Z"), + testjsonl.CodexMsgJSON("assistant", "Assistant reply", "2024-01-01T10:00:03Z"), + testjsonl.CodexMsgJSON("user", " "+legacyGoal, "2024-01-01T10:00:04Z"), + testjsonl.CodexMsgJSON("user", "Follow-up user message", "2024-01-01T10:00:05Z"), + ) + sess, msgs := runCodexParserTest(t, "test.jsonl", content, false) + require.NotNil(t, sess) + require.Len(t, msgs, 3) + assert.Equal(t, "Actual user message", sess.FirstMessage) + assert.Equal(t, 2, sess.UserMessageCount) + assert.Equal(t, RoleUser, msgs[0].Role) + assert.Equal(t, "Actual user message", msgs[0].Content) + assert.Equal(t, RoleAssistant, msgs[1].Role) + assert.Equal(t, "Assistant reply", msgs[1].Content) + assert.Equal(t, RoleUser, msgs[2].Role) + assert.Equal(t, "Follow-up user message", msgs[2].Content) + }) + // Codex injects skill template content as role=user JSONL // entries when the model invokes a skill. These look like // follow-up user turns to a naive count, which inflates @@ -2142,6 +2170,41 @@ func TestParseCodexSessionFrom_SystemMessageDoesNotRequireFullParse(t *testing.T assert.False(t, endedAt.IsZero()) } +func TestParseCodexSessionFrom_GoalContextDoesNotRequireFullParse(t *testing.T) { + t.Parallel() + + initial := testjsonl.JoinJSONL( + testjsonl.CodexSessionMetaJSON("inc-goal", "/tmp", "codex_cli_rs", tsEarly), + testjsonl.CodexMsgJSON("user", "hello", tsEarlyS1), + ) + path := createTestFile(t, "codex-goal-inc.jsonl", initial) + + info, err := os.Stat(path) + require.NoError(t, err) + offset := info.Size() + + currentGoal := "\n" + + "Continue working toward the active thread goal.\n" + + "" + legacyGoal := "\n" + + "Continue working toward the active thread goal.\n" + + "" + + f, err := os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0o644) + require.NoError(t, err) + _, err = f.WriteString(testjsonl.JoinJSONL( + testjsonl.CodexMsgJSON("user", currentGoal, tsEarlyS5), + testjsonl.CodexMsgJSON("user", "\n "+legacyGoal, tsLate), + )) + require.NoError(t, err) + require.NoError(t, f.Close()) + + newMsgs, endedAt, _, err := ParseCodexSessionFrom(path, offset, 1, false) + require.NoError(t, err) + assert.Empty(t, newMsgs) + assert.False(t, endedAt.IsZero()) +} + func TestParseCodexSessionFrom_RunningNotificationRequiresFullParse(t *testing.T) { t.Parallel() diff --git a/internal/server/export.go b/internal/server/export.go index 299a60f77..ee3352398 100644 --- a/internal/server/export.go +++ b/internal/server/export.go @@ -628,6 +628,8 @@ footer a { func generateExportHTML( session *db.Session, msgs []db.Message, ) string { + msgs = filterExportHTMLMessages(msgs) + startedAt := "" if session.StartedAt != nil { startedAt = formatTimestamp(*session.StartedAt) @@ -636,7 +638,7 @@ func generateExportHTML( data := exportData{ Project: session.Project, Agent: agentDisplayName(session.Agent), - MessageCount: session.MessageCount, + MessageCount: len(msgs), StartedAt: startedAt, Messages: make([]exportMessage, len(msgs)), } @@ -670,6 +672,17 @@ func generateExportHTML( return b.String() } +func filterExportHTMLMessages(msgs []db.Message) []db.Message { + filtered := make([]db.Message, 0, len(msgs)) + for _, m := range msgs { + if db.IsSystemPrefixed(m.Content, m.Role) { + continue + } + filtered = append(filtered, m) + } + return filtered +} + func generateInsightExportHTML(insight *db.Insight) string { data := insightExportData{ Title: insightExportTitle(insight), @@ -752,7 +765,8 @@ func focusedExportOrdinals(msgs []db.Message) map[int]bool { continue } - if m.IsSystem || isThinkingOnly(m.Content) { + if m.IsSystem || db.IsSystemPrefixed(m.Content, m.Role) || + isThinkingOnly(m.Content) { continue } diff --git a/internal/server/export_test.go b/internal/server/export_test.go index fcacf267a..fbfbe44d9 100644 --- a/internal/server/export_test.go +++ b/internal/server/export_test.go @@ -498,6 +498,58 @@ func TestGenerateExportHTML_TranscriptModeControls(t *testing.T) { }) } +func TestGenerateExportHTML_OmitsGoalContextRows(t *testing.T) { + t.Parallel() + session := testSession(func(s *db.Session) { + s.MessageCount = 4 + }) + currentGoal := "\n" + + "Continue working toward the active thread goal.\n" + + "" + legacyGoal := "\n" + + "Continue working toward the active thread goal.\n" + + "" + msgs := []db.Message{ + { + SessionID: "test-id", Ordinal: 0, + Role: "user", Content: "Actual user message", + Timestamp: "2025-01-15T10:00:00Z", + }, + { + SessionID: "test-id", Ordinal: 1, + Role: "user", Content: "\n\t" + currentGoal, + Timestamp: "2025-01-15T10:00:01Z", + }, + { + SessionID: "test-id", Ordinal: 2, + Role: "user", Content: " " + legacyGoal, + Timestamp: "2025-01-15T10:00:02Z", + }, + { + SessionID: "test-id", Ordinal: 3, + Role: "assistant", Content: "Assistant reply", + Timestamp: "2025-01-15T10:00:03Z", + }, + } + + html := generateExportHTML(session, msgs) + + assertContainsAll(t, html, []string{ + "2 messages", + `class="message user" data-ordinal="0"`, + `class="message assistant" data-ordinal="3"`, + "Actual user message", + "Assistant reply", + }) + assertContainsNone(t, html, []string{ + `data-ordinal="1"`, + `data-ordinal="2"`, + "", + "Continue working toward the active thread goal.", + }) +} + func TestFocusedExportOrdinals(t *testing.T) { t.Parallel() tests := []struct { @@ -583,6 +635,27 @@ func TestFocusedExportOrdinals(t *testing.T) { }, want: []int{0, 1, 4}, }, + { + name: "ignores system-prefixed goal contexts", + msgs: []db.Message{ + exportUserMsg(0), + exportAssistantMsg(1, "draft"), + { + SessionID: "test-id", + Ordinal: 2, + Role: "user", + Content: `state`, + }, + { + SessionID: "test-id", + Ordinal: 3, + Role: "user", + Content: "\n\tstate", + }, + exportAssistantMsg(4, "final"), + }, + want: []int{0, 4}, + }, { name: "keeps answer before compact boundary", msgs: []db.Message{ From b52dd73ea39bd6a0da00c5099b22ba8e06fd2336 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 26 Jun 2026 13:21:55 -0500 Subject: [PATCH 2/5] chore(parser): drop duplicate Codex goal context changes The parser-level /goal classification already landed in 9bee8a3c, so this branch should only carry the stale persisted-row repair path. Removing the repeated parser refactor and duplicate tests keeps the remaining diff focused on data-version resync and runtime filtering for legacy stored messages. --- internal/parser/codex.go | 22 ++++------ internal/parser/codex_parser_test.go | 63 ---------------------------- 2 files changed, 7 insertions(+), 78 deletions(-) diff --git a/internal/parser/codex.go b/internal/parser/codex.go index d39a53bc8..f4b872269 100644 --- a/internal/parser/codex.go +++ b/internal/parser/codex.go @@ -1739,23 +1739,15 @@ func IsIncrementalFullParseFallback(err error) bool { errors.Is(err, ErrClaudeIncrementalNeedsFullParse) } -var codexSystemMessagePrefixes = []string{ - "# AGENTS.md", - "", - "", - "", -} - func isCodexSystemMessage(content string) bool { trimmed := strings.TrimSpace(content) - for _, prefix := range codexSystemMessagePrefixes { - if strings.HasPrefix(trimmed, prefix) { - return true - } - } - return isCodexTurnAbortedMessage(trimmed) || - isCodexSubagentNotification(trimmed) || - isCodexGoalContext(trimmed) + return strings.HasPrefix(content, "# AGENTS.md") || + strings.HasPrefix(content, "") || + strings.HasPrefix(content, "") || + isCodexTurnAbortedMessage(content) || + strings.HasPrefix(trimmed, "") || + isCodexSubagentNotification(content) || + isCodexGoalContext(content) } // isCodexGoalContext reports whether content is a Codex /goal diff --git a/internal/parser/codex_parser_test.go b/internal/parser/codex_parser_test.go index 886fd6e3c..8cb3da38c 100644 --- a/internal/parser/codex_parser_test.go +++ b/internal/parser/codex_parser_test.go @@ -1398,34 +1398,6 @@ func TestParseCodexSession_EdgeCases(t *testing.T) { assert.Equal(t, "Actual user message", msgs[0].Content) }) - t.Run("skips goal continuation context wrappers", func(t *testing.T) { - currentGoal := "\n" + - "Continue working toward the active thread goal.\n" + - "" - legacyGoal := "\n" + - "Continue working toward the active thread goal.\n" + - "" - content := testjsonl.JoinJSONL( - testjsonl.CodexSessionMetaJSON("abc", "/tmp", "user", tsEarly), - testjsonl.CodexMsgJSON("user", "\n\t"+currentGoal, tsEarlyS1), - testjsonl.CodexMsgJSON("user", "Actual user message", "2024-01-01T10:00:02Z"), - testjsonl.CodexMsgJSON("assistant", "Assistant reply", "2024-01-01T10:00:03Z"), - testjsonl.CodexMsgJSON("user", " "+legacyGoal, "2024-01-01T10:00:04Z"), - testjsonl.CodexMsgJSON("user", "Follow-up user message", "2024-01-01T10:00:05Z"), - ) - sess, msgs := runCodexParserTest(t, "test.jsonl", content, false) - require.NotNil(t, sess) - require.Len(t, msgs, 3) - assert.Equal(t, "Actual user message", sess.FirstMessage) - assert.Equal(t, 2, sess.UserMessageCount) - assert.Equal(t, RoleUser, msgs[0].Role) - assert.Equal(t, "Actual user message", msgs[0].Content) - assert.Equal(t, RoleAssistant, msgs[1].Role) - assert.Equal(t, "Assistant reply", msgs[1].Content) - assert.Equal(t, RoleUser, msgs[2].Role) - assert.Equal(t, "Follow-up user message", msgs[2].Content) - }) - // Codex injects skill template content as role=user JSONL // entries when the model invokes a skill. These look like // follow-up user turns to a naive count, which inflates @@ -2170,41 +2142,6 @@ func TestParseCodexSessionFrom_SystemMessageDoesNotRequireFullParse(t *testing.T assert.False(t, endedAt.IsZero()) } -func TestParseCodexSessionFrom_GoalContextDoesNotRequireFullParse(t *testing.T) { - t.Parallel() - - initial := testjsonl.JoinJSONL( - testjsonl.CodexSessionMetaJSON("inc-goal", "/tmp", "codex_cli_rs", tsEarly), - testjsonl.CodexMsgJSON("user", "hello", tsEarlyS1), - ) - path := createTestFile(t, "codex-goal-inc.jsonl", initial) - - info, err := os.Stat(path) - require.NoError(t, err) - offset := info.Size() - - currentGoal := "\n" + - "Continue working toward the active thread goal.\n" + - "" - legacyGoal := "\n" + - "Continue working toward the active thread goal.\n" + - "" - - f, err := os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0o644) - require.NoError(t, err) - _, err = f.WriteString(testjsonl.JoinJSONL( - testjsonl.CodexMsgJSON("user", currentGoal, tsEarlyS5), - testjsonl.CodexMsgJSON("user", "\n "+legacyGoal, tsLate), - )) - require.NoError(t, err) - require.NoError(t, f.Close()) - - newMsgs, endedAt, _, err := ParseCodexSessionFrom(path, offset, 1, false) - require.NoError(t, err) - assert.Empty(t, newMsgs) - assert.False(t, endedAt.IsZero()) -} - func TestParseCodexSessionFrom_RunningNotificationRequiresFullParse(t *testing.T) { t.Parallel() From c54382cb31b71a78403809d9d21c645e75c46124 Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Fri, 26 Jun 2026 14:17:31 -0700 Subject: [PATCH 3/5] fix(export): preserve non-goal system rows --- internal/db/search.go | 18 +++++++++++++ internal/db/search_test.go | 24 +++++++++++++++++ internal/server/export.go | 4 +-- internal/server/export_test.go | 47 ++++++++++++++++++++++++++++++++++ 4 files changed, 91 insertions(+), 2 deletions(-) diff --git a/internal/db/search.go b/internal/db/search.go index 8b6803519..160c7923a 100644 --- a/internal/db/search.go +++ b/internal/db/search.go @@ -29,6 +29,24 @@ var SystemMsgPrefixes = []string{ ``, } +// IsGoalContextPrefixed reports whether a user-role message is a legacy +// Codex /goal continuation wrapper that may already be stored in older +// archives or read-only stores. +func IsGoalContextPrefixed(content, role string) bool { + if role != "user" { + return false + } + trimmed := strings.TrimLeft(content, systemPrefixTrimCutset) + if strings.HasPrefix(trimmed, "") { + return true + } + if strings.HasPrefix(trimmed, "") + return ok && strings.Contains(openTag, `source="goal"`) + } + return false +} + // SystemPrefixSQL returns a SQL clause that excludes user messages // matching any system prefix. The column alias for content must be // passed (e.g. "m.content" or "m2.content"). Uses case-sensitive diff --git a/internal/db/search_test.go b/internal/db/search_test.go index 2a1f40ce8..e406824ce 100644 --- a/internal/db/search_test.go +++ b/internal/db/search_test.go @@ -35,6 +35,30 @@ func TestIsSystemPrefixed(t *testing.T) { } } +func TestIsGoalContextPrefixed(t *testing.T) { + t.Parallel() + cases := []struct { + name string + content string + role string + want bool + }{ + {"legacy wrapper", "state", "user", true}, + {"legacy wrapper with whitespace", "\n\tstate", "user", true}, + {"current wrapper", `state`, "user", true}, + {"current wrapper with extra attrs", `state`, "user", true}, + {"non goal internal context", `state`, "user", false}, + {"non goal prefix", "This session is being continued from a previous run", "user", false}, + {"assistant role ignored", "state", "assistant", false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tc.want, IsGoalContextPrefixed(tc.content, tc.role)) + }) + } +} + func TestSearch(t *testing.T) { d := testDB(t) requireFTS(t, d) diff --git a/internal/server/export.go b/internal/server/export.go index ee3352398..91405b2a0 100644 --- a/internal/server/export.go +++ b/internal/server/export.go @@ -675,7 +675,7 @@ func generateExportHTML( func filterExportHTMLMessages(msgs []db.Message) []db.Message { filtered := make([]db.Message, 0, len(msgs)) for _, m := range msgs { - if db.IsSystemPrefixed(m.Content, m.Role) { + if db.IsGoalContextPrefixed(m.Content, m.Role) { continue } filtered = append(filtered, m) @@ -765,7 +765,7 @@ func focusedExportOrdinals(msgs []db.Message) map[int]bool { continue } - if m.IsSystem || db.IsSystemPrefixed(m.Content, m.Role) || + if m.IsSystem || db.IsGoalContextPrefixed(m.Content, m.Role) || isThinkingOnly(m.Content) { continue } diff --git a/internal/server/export_test.go b/internal/server/export_test.go index fbfbe44d9..9b515d512 100644 --- a/internal/server/export_test.go +++ b/internal/server/export_test.go @@ -550,6 +550,39 @@ func TestGenerateExportHTML_OmitsGoalContextRows(t *testing.T) { }) } +func TestGenerateExportHTML_PreservesNonGoalSystemPrefixedRows(t *testing.T) { + t.Parallel() + session := testSession(func(s *db.Session) { + s.MessageCount = 3 + }) + msgs := []db.Message{ + { + SessionID: "test-id", Ordinal: 0, + Role: "user", Content: "This session is being continued from a previous conversation.", + Timestamp: "2025-01-15T10:00:00Z", + }, + { + SessionID: "test-id", Ordinal: 1, + Role: "user", Content: "done", + Timestamp: "2025-01-15T10:00:01Z", + }, + { + SessionID: "test-id", Ordinal: 2, + Role: "user", Content: "Stop hook feedback: blocked", + Timestamp: "2025-01-15T10:00:02Z", + }, + } + + html := generateExportHTML(session, msgs) + + assertContainsAll(t, html, []string{ + "3 messages", + "This session is being continued from a previous conversation.", + "<task-notification>done</task-notification>", + "Stop hook feedback: blocked", + }) +} + func TestFocusedExportOrdinals(t *testing.T) { t.Parallel() tests := []struct { @@ -656,6 +689,20 @@ func TestFocusedExportOrdinals(t *testing.T) { }, want: []int{0, 4}, }, + { + name: "keeps non-goal system-prefixed user rows", + msgs: []db.Message{ + exportUserMsg(0), + { + SessionID: "test-id", + Ordinal: 1, + Role: "user", + Content: "Stop hook feedback: blocked", + }, + exportAssistantMsg(2, "answer"), + }, + want: []int{0, 1, 2}, + }, { name: "keeps answer before compact boundary", msgs: []db.Message{ From 5826ee7eaa76ad7704ecbb31fac7a890c2cdcc68 Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Fri, 26 Jun 2026 16:54:08 -0700 Subject: [PATCH 4/5] fix(search): align Codex goal context matching --- frontend/src/lib/utils/messages.test.ts | 25 +++++ frontend/src/lib/utils/messages.ts | 26 ++++- internal/db/search.go | 132 ++++++++++++++++++++---- internal/db/search_test.go | 48 +++++++++ internal/duckdb/analytics_usage.go | 2 +- internal/duckdb/messages.go | 12 +-- internal/duckdb/store.go | 10 +- internal/postgres/activity.go | 2 +- internal/postgres/messages.go | 6 +- internal/postgres/search_content.go | 4 +- internal/postgres/trends.go | 2 +- internal/server/export_test.go | 4 +- 12 files changed, 222 insertions(+), 51 deletions(-) diff --git a/frontend/src/lib/utils/messages.test.ts b/frontend/src/lib/utils/messages.test.ts index ee4e4d5d6..38024d89b 100644 --- a/frontend/src/lib/utils/messages.test.ts +++ b/frontend/src/lib/utils/messages.test.ts @@ -63,6 +63,14 @@ describe("isSystemMessage", () => { "codex internal goal context", ' state', ], + [ + "codex internal goal context with attr before source", + 'state', + ], + [ + "codex internal goal context with attr after source", + 'state', + ], ])("detects prefix-based system message: %s", (_label, content) => { expect(isSystemMessage(msg({ content }))).toBe(true); }); @@ -73,6 +81,23 @@ describe("isSystemMessage", () => { ).toBe(false); }); + it.each([ + [ + "non-goal internal context", + 'state', + ], + [ + "data-source attribute", + 'state', + ], + [ + "missing closing tag delimiter", + ' { + expect(isSystemMessage(msg({ content }))).toBe(false); + }); + it.each([ "continuation", "resume", diff --git a/frontend/src/lib/utils/messages.ts b/frontend/src/lib/utils/messages.ts index 09aef855e..11c997fc6 100644 --- a/frontend/src/lib/utils/messages.ts +++ b/frontend/src/lib/utils/messages.ts @@ -8,10 +8,12 @@ const SYSTEM_MSG_PREFIXES = [ "", "", - '', ]; +const LEGACY_GOAL_CONTEXT_PREFIX = ""; +const CODEX_INTERNAL_CONTEXT_TAG_PREFIX = " trimmed.startsWith(p)); + return ( + isGoalContextMessage(trimmed) || + SYSTEM_MSG_PREFIXES.some((p) => trimmed.startsWith(p)) + ); +} + +function isGoalContextMessage(trimmedContent: string): boolean { + if (trimmedContent.startsWith(LEGACY_GOAL_CONTEXT_PREFIX)) { + return true; + } + if (!trimmedContent.startsWith(CODEX_INTERNAL_CONTEXT_TAG_PREFIX)) { + return false; + } + const tagEnd = trimmedContent.indexOf(">"); + if (tagEnd < 0) { + return false; + } + const openTag = trimmedContent.slice(0, tagEnd); + return GOAL_CONTEXT_SOURCE_ATTR_RE.test(openTag); } /** diff --git a/internal/db/search.go b/internal/db/search.go index 160c7923a..547d16530 100644 --- a/internal/db/search.go +++ b/internal/db/search.go @@ -3,6 +3,7 @@ package db import ( "context" "fmt" + "regexp" "strings" ) @@ -12,10 +13,10 @@ const ( snippetTokenLength = 32 ) -// SystemMsgPrefixes lists content prefixes that identify system-injected -// user messages. These are excluded from search results even when the -// is_system column has not been backfilled (e.g. Claude sessions parsed -// before schema version 2). Keep in sync with the frontend list in +// SystemMsgPrefixes lists non-goal content prefixes that identify +// system-injected user messages. These are excluded from search results even +// when the is_system column has not been backfilled (e.g. Claude sessions +// parsed before schema version 2). Keep in sync with the frontend list in // frontend/src/lib/utils/messages.ts. var SystemMsgPrefixes = []string{ "This session is being continued", @@ -25,10 +26,18 @@ var SystemMsgPrefixes = []string{ "", "", - ``, } +const ( + legacyGoalContextPrefix = "" + codexInternalContextTagPrefix = "") { + if strings.HasPrefix(trimmed, legacyGoalContextPrefix) { return true } - if strings.HasPrefix(trimmed, "") - return ok && strings.Contains(openTag, `source="goal"`) + return ok && goalContextSourceAttrRe.MatchString(openTag) } return false } +type systemPrefixSQLDialect int + +const ( + systemPrefixSQLite systemPrefixSQLDialect = iota + systemPrefixPostgres + systemPrefixDuckDB +) + // SystemPrefixSQL returns a SQL clause that excludes user messages -// matching any system prefix. The column alias for content must be -// passed (e.g. "m.content" or "m2.content"). Uses case-sensitive -// substr comparison, which behaves identically on SQLite and -// PostgreSQL (unlike LIKE, which is case-insensitive on SQLite). +// matching any system prefix. The column alias for content must be passed +// (e.g. "m.content" or "m2.content"). Uses case-sensitive substr and +// position checks instead of LIKE, which is case-insensitive on SQLite. func SystemPrefixSQL(contentCol, roleCol string) string { + return systemPrefixSQL(contentCol, roleCol, systemPrefixSQLite) +} + +// PostgresSystemPrefixSQL is the PostgreSQL form of SystemPrefixSQL. +func PostgresSystemPrefixSQL(contentCol, roleCol string) string { + return systemPrefixSQL(contentCol, roleCol, systemPrefixPostgres) +} + +// DuckDBSystemPrefixSQL is the DuckDB form of SystemPrefixSQL. +func DuckDBSystemPrefixSQL(contentCol, roleCol string) string { + return systemPrefixSQL(contentCol, roleCol, systemPrefixDuckDB) +} + +func systemPrefixSQL( + contentCol, roleCol string, dialect systemPrefixSQLDialect, +) string { // LTRIM strips the same whitespace as Go's strings.TrimSpace, // JS .trim(), and the parser's isSystem helpers: ASCII whitespace, // BOM (U+FEFF), and Unicode // spaces (U+0085, U+00A0, U+1680, U+2000–U+200A, U+2028, - // U+2029, U+202F, U+205F, U+3000). Both SQLite and PostgreSQL + // U+2029, U+202F, U+205F, U+3000). SQLite, PostgreSQL, and DuckDB // handle multi-byte UTF-8 characters in the trim set correctly. - trimmed := "LTRIM(" + contentCol + ", ' \t\n\v\f\r" + - "\u0085\u00A0\u1680" + - "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A" + - "\u2028\u2029\u202F\u205F\u3000\uFEFF')" - parts := make([]string, len(SystemMsgPrefixes)) - for i, p := range SystemMsgPrefixes { - parts[i] = fmt.Sprintf( + trimmed := systemPrefixSQLTrimmed(contentCol) + parts := make([]string, 0, len(SystemMsgPrefixes)+1) + for _, p := range SystemMsgPrefixes { + parts = append(parts, fmt.Sprintf( "substr(%s, 1, %d) = '%s'", trimmed, len(p), p, - ) + )) } + parts = append(parts, goalContextPrefixSQL(trimmed, dialect)) return "NOT (" + roleCol + " = 'user' AND (" + strings.Join(parts, " OR ") + "))" } +func systemPrefixSQLTrimmed(contentCol string) string { + return "LTRIM(" + contentCol + ", ' \t\n\v\f\r" + + "\u0085\u00A0\u1680" + + "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A" + + "\u2028\u2029\u202F\u205F\u3000\uFEFF')" +} + +func goalContextPrefixSQL(trimmed string, dialect systemPrefixSQLDialect) string { + legacy := fmt.Sprintf("substr(%s, 1, %d) = '%s'", + trimmed, len(legacyGoalContextPrefix), legacyGoalContextPrefix) + current := fmt.Sprintf( + "(substr(%[1]s, 1, %[2]d) = '%[3]s' AND %[4]s)", + trimmed, len(codexInternalContextTagPrefix), + codexInternalContextTagPrefix, + goalContextSourceAttrSQL(openingTagSQL(trimmed, dialect), dialect), + ) + return "(" + legacy + " OR " + current + ")" +} + +func openingTagSQL(trimmed string, dialect systemPrefixSQLDialect) string { + return fmt.Sprintf("substr(%s, 1, %s)", + trimmed, sqlPosition(dialect, ">", trimmed)) +} + +func goalContextSourceAttrSQL( + openTag string, dialect systemPrefixSQLDialect, +) string { + normalized := openTag + for _, ws := range []string{"\t", "\n", "\v", "\f", "\r"} { + normalized = fmt.Sprintf("replace(%s, '%s', ' ')", normalized, ws) + } + checks := []string{ + sqlContains(dialect, normalized, goalContextSourceAttrSQLPrefix+" "), + sqlContains(dialect, normalized, goalContextSourceAttrSQLPrefix+">"), + sqlContains(dialect, normalized, goalContextSourceAttrSQLPrefix+"/>"), + } + return "(" + strings.Join(checks, " OR ") + ")" +} + +func sqlContains( + dialect systemPrefixSQLDialect, haystack, needle string, +) string { + return sqlPosition(dialect, needle, haystack) + " > 0" +} + +func sqlPosition( + dialect systemPrefixSQLDialect, needle, haystack string, +) string { + quotedNeedle := "'" + needle + "'" + if dialect == systemPrefixPostgres { + return fmt.Sprintf("POSITION(%s IN %s)", quotedNeedle, haystack) + } + return fmt.Sprintf("instr(%s, %s)", haystack, quotedNeedle) +} + // systemPrefixTrimCutset is the leading-whitespace set SystemPrefixSQL's // LTRIM strips: ASCII whitespace, BOM, and the Unicode spaces. Kept // identical so the Go and SQL system-prefix checks agree. @@ -90,6 +175,9 @@ func IsSystemPrefixed(content, role string) bool { if role != "user" { return false } + if IsGoalContextPrefixed(content, role) { + return true + } trimmed := strings.TrimLeft(content, systemPrefixTrimCutset) for _, p := range SystemMsgPrefixes { if strings.HasPrefix(trimmed, p) { diff --git a/internal/db/search_test.go b/internal/db/search_test.go index e406824ce..a1f965466 100644 --- a/internal/db/search_test.go +++ b/internal/db/search_test.go @@ -24,6 +24,10 @@ func TestIsSystemPrefixed(t *testing.T) { {"bom then prefix", "\uFEFFx", "user", true}, {"legacy goal context prefix", "\n\tstate", "user", true}, {"codex internal goal context prefix", `state`, "user", true}, + {"codex goal context with attr before source", `state`, "user", true}, + {"codex goal context with attr after source", `state`, "user", true}, + {"codex non-goal internal context", `state`, "user", false}, + {"codex data-source attr is not goal", `state`, "user", false}, {"assistant role is never system-prefixed", SystemMsgPrefixes[0], "assistant", false}, {"prefix mid-content does not match", "see later", "user", false}, } @@ -47,7 +51,11 @@ func TestIsGoalContextPrefixed(t *testing.T) { {"legacy wrapper with whitespace", "\n\tstate", "user", true}, {"current wrapper", `state`, "user", true}, {"current wrapper with extra attrs", `state`, "user", true}, + {"current wrapper with attrs after source", `state`, "user", true}, + {"current wrapper with newline before source", "state", "user", true}, {"non goal internal context", `state`, "user", false}, + {"data-source attr is not goal", `state`, "user", false}, + {"missing closing tag delimiter", `state", "assistant", false}, } @@ -59,6 +67,46 @@ func TestIsGoalContextPrefixed(t *testing.T) { } } +func TestSystemPrefixSQL(t *testing.T) { + d := testDB(t) + rows, err := d.getReader().QueryContext(context.Background(), ` + WITH candidates(label, role, content) AS ( + VALUES + ('normal', 'user', 'regular message'), + ('assistant-goal', 'assistant', 'state'), + ('legacy-goal', 'user', 'state'), + ('current-goal', 'user', 'state'), + ('attr-before-source', 'user', 'state'), + ('attr-after-source', 'user', 'state'), + ('newline-before-source', 'user', 'state'), + ('self-closing-goal', 'user', 'state'), + ('non-goal-internal', 'user', 'state'), + ('data-source', 'user', 'state'), + ('missing-close', 'user', '\n" + + currentGoal := "\n" + "Continue working toward the active thread goal.\n" + "" legacyGoal := "\n" + @@ -677,7 +677,7 @@ func TestFocusedExportOrdinals(t *testing.T) { SessionID: "test-id", Ordinal: 2, Role: "user", - Content: `state`, + Content: `state`, }, { SessionID: "test-id", From 4b8d97b39a30930523f2a745f1420519eddd7d58 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 28 Jun 2026 09:19:48 -0500 Subject: [PATCH 5/5] fix(parser): preserve non-goal Codex internal contexts Codex goal-context parsing still matched source="goal" as a raw substring, so wrappers such as data-source="goal" were treated as synthetic goal continuations. A data-version reparse could therefore drop messages that the search and export filters correctly preserve. Match the source attribute with the same boundary semantics as the persisted-row matcher so only the real source="goal" attribute is suppressed. --- internal/parser/codex.go | 8 ++++- internal/parser/codex_parser_test.go | 47 ++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/internal/parser/codex.go b/internal/parser/codex.go index f4b872269..af763aada 100644 --- a/internal/parser/codex.go +++ b/internal/parser/codex.go @@ -8,6 +8,7 @@ import ( "io" "os" "path/filepath" + "regexp" "slices" "sort" "strconv" @@ -31,6 +32,11 @@ var errCodexIncrementalNeedsFullParse = errors.New( "codex incremental event requires full parse", ) +const codexGoalContextSourceAttr = `source="goal"` + +var codexGoalContextSourceAttrRe = regexp.MustCompile(`(?:^|\s)` + + regexp.QuoteMeta(codexGoalContextSourceAttr) + `(?:\s|/|$)`) + var codexSessionIndexCache = struct { mu sync.Mutex entries map[string]codexSessionIndexEntry @@ -1767,7 +1773,7 @@ func isCodexGoalContext(content string) bool { } if strings.HasPrefix(trimmed, "") - return ok && strings.Contains(openTag, `source="goal"`) + return ok && codexGoalContextSourceAttrRe.MatchString(openTag) } return false } diff --git a/internal/parser/codex_parser_test.go b/internal/parser/codex_parser_test.go index 8cb3da38c..cc2acf282 100644 --- a/internal/parser/codex_parser_test.go +++ b/internal/parser/codex_parser_test.go @@ -1437,6 +1437,10 @@ func TestParseCodexSession_EdgeCases(t *testing.T) { "The objective below is user-provided data." current := "\n" + goalBody + "\n" + attrBeforeSource := "\n" + + goalBody + "\n" + attrAfterSource := "\n" + + goalBody + "\n" legacy := "\n" + goalBody + "\n" content := testjsonl.JoinJSONL( testjsonl.CodexSessionMetaJSON("abc", "/tmp", "user", tsEarly), @@ -1444,8 +1448,10 @@ func TestParseCodexSession_EdgeCases(t *testing.T) { testjsonl.CodexMsgJSON("assistant", "Working on it", "2024-01-01T10:00:02Z"), testjsonl.CodexMsgJSON("user", current, "2024-01-01T10:00:03Z"), testjsonl.CodexMsgJSON("assistant", "Still working", "2024-01-01T10:00:04Z"), - testjsonl.CodexMsgJSON("user", legacy, "2024-01-01T10:00:05Z"), - testjsonl.CodexMsgJSON("user", "Real second request", "2024-01-01T10:00:06Z"), + testjsonl.CodexMsgJSON("user", attrBeforeSource, "2024-01-01T10:00:05Z"), + testjsonl.CodexMsgJSON("user", attrAfterSource, "2024-01-01T10:00:06Z"), + testjsonl.CodexMsgJSON("user", legacy, "2024-01-01T10:00:07Z"), + testjsonl.CodexMsgJSON("user", "Real second request", "2024-01-01T10:00:08Z"), ) sess, msgs := runCodexParserTest(t, "test.jsonl", content, false) require.NotNil(t, sess) @@ -1458,6 +1464,43 @@ func TestParseCodexSession_EdgeCases(t *testing.T) { "goal continuation context must not count as user turns") }) + t.Run("keeps non-goal codex internal contexts", func(t *testing.T) { + cases := []struct { + name string + content string + }{ + { + name: "data source goal", + content: "\n" + + "Preserve this internal context.\n", + }, + { + name: "other source", + content: "\n" + + "Preserve this internal context.\n", + }, + { + name: "no source", + content: "\n" + + "Preserve this internal context.\n", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + content := testjsonl.JoinJSONL( + testjsonl.CodexSessionMetaJSON("abc", "/tmp", "user", tsEarly), + testjsonl.CodexMsgJSON("user", tc.content, tsEarlyS1), + ) + sess, msgs := runCodexParserTest(t, "test.jsonl", content, false) + require.NotNil(t, sess) + require.Len(t, msgs, 1) + assert.Equal(t, tc.content, msgs[0].Content) + assert.Equal(t, 1, sess.UserMessageCount, + "non-goal internal contexts must count as user turns") + }) + } + }) + // Only the structured goal wrapper is system content; a real user // message that merely quotes the goal sentence stays in the transcript. t.Run("keeps unwrapped goal-like user text", func(t *testing.T) {