Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions frontend/src/lib/utils/messages.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,19 @@ describe("isSystemMessage", () => {
["command-name", "<command-name>/commit</command-name>"],
["local-command", "<local-command-output>ok</local-command-output>"],
["stop hook", "Stop hook feedback: blocked"],
["legacy goal context", "\n\t<goal_context>state</goal_context>"],
[
"codex internal goal context",
' <codex_internal_context source="goal">state',
],
[
"codex internal goal context with attr before source",
'<codex_internal_context foo="bar" source="goal">state',
],
[
"codex internal goal context with attr after source",
'<codex_internal_context source="goal" foo="bar">state',
],
])("detects prefix-based system message: %s", (_label, content) => {
expect(isSystemMessage(msg({ content }))).toBe(true);
});
Expand All @@ -68,6 +81,23 @@ describe("isSystemMessage", () => {
).toBe(false);
});

// Negative cases: each input starts with the <codex_internal_context tag
// prefix but must NOT be classified as a goal-context system message.
it.each([
  [
    // The source attribute is present but its value is not "goal".
    "non-goal internal context",
    '<codex_internal_context source="other">state',
  ],
  [
    // "data-source" is a different attribute: source="goal" is preceded by
    // "-", not by whitespace, so the attribute pattern does not match.
    "data-source attribute",
    '<codex_internal_context data-source="goal">state',
  ],
  [
    // No ">" ever closes the opening tag, so there is no tag to inspect.
    "missing closing tag delimiter",
    '<codex_internal_context source="goal" state',
  ],
])("does not detect non-goal codex context: %s", (_label, content) => {
  expect(isSystemMessage(msg({ content }))).toBe(false);
});

it.each([
"continuation",
"resume",
Expand Down
24 changes: 23 additions & 1 deletion frontend/src/lib/utils/messages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ const SYSTEM_MSG_PREFIXES = [
"Stop hook feedback:",
];

// Complete opening tag of the legacy goal wrapper; matched as a plain prefix.
const LEGACY_GOAL_CONTEXT_PREFIX = "<goal_context>";
// Start of the attribute-bearing wrapper tag. The tag is deliberately left
// unclosed here because attributes follow before the ">".
const CODEX_INTERNAL_CONTEXT_TAG_PREFIX = "<codex_internal_context";
// Matches a standalone source="goal" attribute: it must be preceded by the
// start of the string or whitespace, and followed by whitespace, "/", or the
// end of the string. This rejects look-alikes such as data-source="goal".
const GOAL_CONTEXT_SOURCE_ATTR_RE = /(?:^|\s)source="goal"(?:\s|\/|$)/;

// Subtypes the Claude parser promotes into visible system messages
// that the SPA renders via SystemBoundaryCard. These must pass
// through the MessageList filter even though is_system=true.
Expand Down Expand Up @@ -39,7 +43,25 @@ export function isSystemMessage(m: Message): boolean {
if (m.is_system) return true;
if (m.role !== "user") return false;
const trimmed = m.content.trim();
return SYSTEM_MSG_PREFIXES.some((p) => trimmed.startsWith(p));
return (
isGoalContextMessage(trimmed) ||
SYSTEM_MSG_PREFIXES.some((p) => trimmed.startsWith(p))
);
}

/**
 * Reports whether message content opens with a goal-context wrapper.
 *
 * Two forms are recognized: the legacy `<goal_context>` tag, and a
 * `<codex_internal_context ...>` opening tag that carries a standalone
 * `source="goal"` attribute.
 *
 * @param trimmedContent - Message content; the caller is expected to have
 *   trimmed it already, since only the very start of the string is examined.
 * @returns `true` when the content starts with either wrapper form.
 */
function isGoalContextMessage(trimmedContent: string): boolean {
  const isLegacyWrapper = trimmedContent.startsWith(LEGACY_GOAL_CONTEXT_PREFIX);
  if (isLegacyWrapper) return true;

  const isCodexWrapper = trimmedContent.startsWith(
    CODEX_INTERNAL_CONTEXT_TAG_PREFIX,
  );
  // An opening tag that is never closed by ">" is not treated as a wrapper.
  const closeIndex = isCodexWrapper ? trimmedContent.indexOf(">") : -1;
  if (closeIndex === -1) return false;

  // Only the opening tag (everything before the first ">") is inspected, so
  // text in the message body can never satisfy the attribute check.
  const openingTag = trimmedContent.slice(0, closeIndex);
  return GOAL_CONTEXT_SOURCE_ATTR_RE.test(openingTag);
}

/**
Expand Down
5 changes: 4 additions & 1 deletion internal/db/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,9 @@ import (
// backfill. Re-parsing persists estimated usage events for existing
// aggregate-only Kimi sessions and preserves explicit native event
// model names instead of the proxy fallback.)
// (56: Codex goal-continuation context wrappers are filtered from
// persisted messages and user_message_count. Existing Codex rows need
// re-parsing so synthetic /goal continuation records are removed.)
// (54: Antigravity .db sessions record a schema-fingerprint
// source_version. Re-parsing populates source_version on existing
// Antigravity IDE and CLI rows so "which agy release produced this
Expand All @@ -258,7 +261,7 @@ import (
// (51: Gemini cumulative-to-delta token reparse.)
// (17: Codex <skill> template filtering.)
// (16: <turn_aborted> system messages.)
const dataVersion = 55
const dataVersion = 56

const tokenCoverageRepairStatsKey = "token_coverage_repair_v1"

Expand Down
7 changes: 6 additions & 1 deletion internal/db/db_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -696,10 +696,15 @@ func TestMigration_ToolResultEventsTable(t *testing.T) {
}

func TestCurrentDataVersionKimiUsageEvents(t *testing.T) {
assert.Equal(t, 55, CurrentDataVersion(),
assert.GreaterOrEqual(t, CurrentDataVersion(), 55,
"Kimi persisted usage events require a data version bump")
}

// TestCurrentDataVersionGoalContextFiltering pins the data version to exactly
// 56, the version introduced for Codex goal-context filtering.
// NOTE(review): the neighbouring Kimi test asserts a lower bound rather than
// an exact value; presumably this exact check is relaxed the same way on the
// next data version bump -- confirm the intended convention.
func TestCurrentDataVersionGoalContextFiltering(t *testing.T) {
	assert.Equal(t, 56, CurrentDataVersion(),
		"Codex goal-context filtering requires a data version bump")
}

func TestInsertMessages_PreservesToolResultEvents(t *testing.T) {
d := testDB(t)
insertSession(t, d, "s-events", "proj")
Expand Down
142 changes: 125 additions & 17 deletions internal/db/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package db
import (
"context"
"fmt"
"regexp"
"strings"
)

Expand All @@ -12,10 +13,10 @@ const (
snippetTokenLength = 32
)

// SystemMsgPrefixes lists content prefixes that identify system-injected
// user messages. These are excluded from search results even when the
// is_system column has not been backfilled (e.g. Claude sessions parsed
// before schema version 2). Keep in sync with the frontend list in
// SystemMsgPrefixes lists non-goal content prefixes that identify
// system-injected user messages. These are excluded from search results even
// when the is_system column has not been backfilled (e.g. Claude sessions
// parsed before schema version 2). Keep in sync with the frontend list in
// frontend/src/lib/utils/messages.ts.
var SystemMsgPrefixes = []string{
"This session is being continued",
Expand All @@ -27,32 +28,136 @@ var SystemMsgPrefixes = []string{
"Stop hook feedback:",
}

const (
	// legacyGoalContextPrefix is the complete opening tag of the legacy goal
	// wrapper; it is matched as a plain prefix.
	legacyGoalContextPrefix = "<goal_context>"
	// codexInternalContextTagPrefix is the start of the attribute-bearing
	// wrapper tag. It is left unclosed because attributes follow before ">".
	codexInternalContextTagPrefix = "<codex_internal_context"
	// goalContextSourceAttr is the attribute that marks an internal-context
	// tag as goal context.
	goalContextSourceAttr = `source="goal"`
	// goalContextSourceAttrSQLPrefix is the same attribute with a leading
	// space, used as the base of the SQL substring needles so the attribute
	// must be preceded by whitespace.
	goalContextSourceAttrSQLPrefix = ` source="goal"`
)

// goalContextSourceAttrRe matches a standalone source="goal" attribute: it
// must be preceded by the start of the text or whitespace and followed by
// whitespace, "/", or the end of the text. This rejects look-alikes such as
// data-source="goal".
var goalContextSourceAttrRe = regexp.MustCompile(`(?:^|\s)` +
	regexp.QuoteMeta(goalContextSourceAttr) + `(?:\s|/|$)`)

// IsGoalContextPrefixed reports whether a user-role message begins, after
// leading whitespace, with a Codex /goal continuation wrapper. Two forms are
// recognized: the legacy <goal_context> tag, and a <codex_internal_context ...>
// opening tag that carries a standalone source="goal" attribute. Such records
// may already be stored in older archives or read-only stores.
//
// Messages with any role other than "user" are never reported.
func IsGoalContextPrefixed(content, role string) bool {
	if role != "user" {
		return false
	}
	body := strings.TrimLeft(content, systemPrefixTrimCutset)
	switch {
	case strings.HasPrefix(body, legacyGoalContextPrefix):
		return true
	case strings.HasPrefix(body, codexInternalContextTagPrefix):
		// Only the opening tag (the text before the first '>') is inspected;
		// a tag that is never closed does not count as a wrapper.
		end := strings.IndexByte(body, '>')
		if end < 0 {
			return false
		}
		return goalContextSourceAttrRe.MatchString(body[:end])
	default:
		return false
	}
}

// systemPrefixSQLDialect selects the SQL dialect that the system-prefix
// exclusion clause is generated for.
type systemPrefixSQLDialect int

const (
	// systemPrefixSQLite is the zero value and the dialect used by
	// SystemPrefixSQL.
	systemPrefixSQLite systemPrefixSQLDialect = iota
	// systemPrefixPostgres selects the PostgreSQL form of the clause.
	systemPrefixPostgres
	// systemPrefixDuckDB selects the DuckDB form of the clause.
	systemPrefixDuckDB
)

// SystemPrefixSQL returns a SQL clause that excludes user messages
// matching any system prefix. The column alias for content must be
// passed (e.g. "m.content" or "m2.content"). Uses case-sensitive
// substr comparison, which behaves identically on SQLite and
// PostgreSQL (unlike LIKE, which is case-insensitive on SQLite).
// matching any system prefix. The column alias for content must be passed
// (e.g. "m.content" or "m2.content"). Uses case-sensitive substr and
// position checks instead of LIKE, which is case-insensitive on SQLite.
func SystemPrefixSQL(contentCol, roleCol string) string {
	// The unqualified name builds the SQLite form of the clause.
	return systemPrefixSQL(contentCol, roleCol, systemPrefixSQLite)
}

// PostgresSystemPrefixSQL is the PostgreSQL form of SystemPrefixSQL. It takes
// the same content and role column expressions and returns the exclusion
// clause built with the PostgreSQL dialect.
func PostgresSystemPrefixSQL(contentCol, roleCol string) string {
	return systemPrefixSQL(contentCol, roleCol, systemPrefixPostgres)
}

// DuckDBSystemPrefixSQL is the DuckDB form of SystemPrefixSQL. It takes the
// same content and role column expressions and returns the exclusion clause
// built with the DuckDB dialect.
func DuckDBSystemPrefixSQL(contentCol, roleCol string) string {
	return systemPrefixSQL(contentCol, roleCol, systemPrefixDuckDB)
}

func systemPrefixSQL(
contentCol, roleCol string, dialect systemPrefixSQLDialect,
) string {
// LTRIM strips the same whitespace as Go's strings.TrimSpace,
// JS .trim(), and the parser's isSystem helpers: ASCII whitespace,
// BOM (U+FEFF), and Unicode
// spaces (U+0085, U+00A0, U+1680, U+2000–U+200A, U+2028,
// U+2029, U+202F, U+205F, U+3000). Both SQLite and PostgreSQL
// U+2029, U+202F, U+205F, U+3000). SQLite, PostgreSQL, and DuckDB
// handle multi-byte UTF-8 characters in the trim set correctly.
trimmed := "LTRIM(" + contentCol + ", ' \t\n\v\f\r" +
"\u0085\u00A0\u1680" +
"\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A" +
"\u2028\u2029\u202F\u205F\u3000\uFEFF')"
parts := make([]string, len(SystemMsgPrefixes))
for i, p := range SystemMsgPrefixes {
parts[i] = fmt.Sprintf(
trimmed := systemPrefixSQLTrimmed(contentCol)
parts := make([]string, 0, len(SystemMsgPrefixes)+1)
for _, p := range SystemMsgPrefixes {
parts = append(parts, fmt.Sprintf(
"substr(%s, 1, %d) = '%s'", trimmed, len(p), p,
)
))
}
parts = append(parts, goalContextPrefixSQL(trimmed, dialect))
return "NOT (" + roleCol + " = 'user' AND (" +
strings.Join(parts, " OR ") + "))"
}

// systemPrefixSQLTrimmed wraps contentCol in an LTRIM call whose character
// set is ASCII whitespace, the Unicode space characters, and the byte-order
// mark, so prefix checks run against content with that leading padding
// removed. contentCol is inserted verbatim as a SQL expression.
func systemPrefixSQLTrimmed(contentCol string) string {
	const (
		asciiSpace   = " \t\n\v\f\r"
		unicodeSpace = "\u0085\u00A0\u1680" +
			"\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A" +
			"\u2028\u2029\u202F\u205F\u3000"
		byteOrderMark = "\uFEFF"
	)
	trimSet := asciiSpace + unicodeSpace + byteOrderMark
	return "LTRIM(" + contentCol + ", '" + trimSet + "')"
}

// goalContextPrefixSQL builds the SQL predicate that is true when the
// already-trimmed content expression starts with a goal-context wrapper:
// either the legacy <goal_context> tag, or a <codex_internal_context tag
// whose opening tag contains the source="goal" attribute.
func goalContextPrefixSQL(trimmed string, dialect systemPrefixSQLDialect) string {
	// startsWith renders a case-sensitive prefix comparison via substr.
	startsWith := func(prefix string) string {
		return fmt.Sprintf("substr(%s, 1, %d) = '%s'",
			trimmed, len(prefix), prefix)
	}
	openTag := openingTagSQL(trimmed, dialect)
	hasGoalSource := goalContextSourceAttrSQL(openTag, dialect)

	legacyForm := startsWith(legacyGoalContextPrefix)
	currentForm := "(" + startsWith(codexInternalContextTagPrefix) +
		" AND " + hasGoalSource + ")"
	return "(" + legacyForm + " OR " + currentForm + ")"
}

// openingTagSQL builds a SQL expression for the leading portion of trimmed
// that ends at the first '>' (the length argument is the position of '>').
// NOTE(review): relies on the dialect's position function returning 0 when
// '>' is absent, which yields an empty substring -- confirm per dialect.
func openingTagSQL(trimmed string, dialect systemPrefixSQLDialect) string {
	closePos := sqlPosition(dialect, ">", trimmed)
	return "substr(" + trimmed + ", 1, " + closePos + ")"
}

// goalContextSourceAttrSQL builds the SQL predicate that is true when the
// openTag expression contains a standalone source="goal" attribute, meaning
// one preceded by whitespace and followed by a space, ">", or "/>".
func goalContextSourceAttrSQL(
	openTag string, dialect systemPrefixSQLDialect,
) string {
	// Fold the other ASCII whitespace characters into plain spaces so the
	// substring needles below only have to deal with ' '.
	expr := openTag
	for _, ws := range "\t\n\v\f\r" {
		expr = "replace(" + expr + ", '" + string(ws) + "', ' ')"
	}

	// Each terminator is what may legally follow the attribute in the tag.
	terminators := [...]string{" ", ">", "/>"}
	clauses := make([]string, 0, len(terminators))
	for _, term := range terminators {
		clauses = append(clauses,
			sqlContains(dialect, expr, goalContextSourceAttrSQLPrefix+term))
	}
	return "(" + strings.Join(clauses, " OR ") + ")"
}

// sqlContains builds a SQL predicate that is true when the haystack
// expression contains the literal needle, by testing the dialect's position
// function for a result greater than zero.
func sqlContains(
	dialect systemPrefixSQLDialect, haystack, needle string,
) string {
	position := sqlPosition(dialect, needle, haystack)
	return position + " > 0"
}

// sqlPosition builds the dialect-specific SQL expression for the position of
// the literal needle inside the haystack expression: POSITION(needle IN
// haystack) for PostgreSQL, instr(haystack, needle) for every other dialect.
//
// needle is wrapped in single quotes without escaping, so it must not itself
// contain a single quote; haystack is inserted verbatim as a SQL expression.
func sqlPosition(
	dialect systemPrefixSQLDialect, needle, haystack string,
) string {
	literal := "'" + needle + "'"
	switch dialect {
	case systemPrefixPostgres:
		return "POSITION(" + literal + " IN " + haystack + ")"
	default:
		return "instr(" + haystack + ", " + literal + ")"
	}
}

// systemPrefixTrimCutset is the leading-whitespace set SystemPrefixSQL's
// LTRIM strips: ASCII whitespace, BOM, and the Unicode spaces. Kept
// identical so the Go and SQL system-prefix checks agree.
Expand All @@ -70,6 +175,9 @@ func IsSystemPrefixed(content, role string) bool {
if role != "user" {
return false
}
if IsGoalContextPrefixed(content, role) {
return true
}
trimmed := strings.TrimLeft(content, systemPrefixTrimCutset)
for _, p := range SystemMsgPrefixes {
if strings.HasPrefix(trimmed, p) {
Expand Down
74 changes: 74 additions & 0 deletions internal/db/search_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ func TestIsSystemPrefixed(t *testing.T) {
{"task-notification prefix", "<task-notification>done</task-notification>", "user", true},
{"leading whitespace then prefix", "\n\t <command-name>/foo", "user", true},
{"bom then prefix", "\uFEFF<command-message>x", "user", true},
{"legacy goal context prefix", "\n\t<goal_context>state</goal_context>", "user", true},
{"codex internal goal context prefix", `<codex_internal_context source="goal">state`, "user", true},
{"codex goal context with attr before source", `<codex_internal_context foo="bar" source="goal">state`, "user", true},
{"codex goal context with attr after source", `<codex_internal_context source="goal" foo="bar">state`, "user", true},
{"codex non-goal internal context", `<codex_internal_context source="other">state`, "user", false},
{"codex data-source attr is not goal", `<codex_internal_context data-source="goal">state`, "user", false},
{"assistant role is never system-prefixed", SystemMsgPrefixes[0], "assistant", false},
{"prefix mid-content does not match", "see <task-notification> later", "user", false},
}
Expand All @@ -33,6 +39,74 @@ func TestIsSystemPrefixed(t *testing.T) {
}
}

// TestIsGoalContextPrefixed table-tests IsGoalContextPrefixed against both
// wrapper forms plus near-miss inputs that must not be classified as goal
// context.
func TestIsGoalContextPrefixed(t *testing.T) {
	t.Parallel()
	cases := []struct {
		name    string
		content string
		role    string
		want    bool
	}{
		// Legacy <goal_context> form, with and without leading whitespace.
		{"legacy wrapper", "<goal_context>state</goal_context>", "user", true},
		{"legacy wrapper with whitespace", "\n\t<goal_context>state", "user", true},
		// Current <codex_internal_context> form: the source="goal" attribute
		// may appear anywhere in the opening tag, including after a newline.
		{"current wrapper", `<codex_internal_context source="goal">state`, "user", true},
		{"current wrapper with extra attrs", `<codex_internal_context foo="bar" source="goal">state`, "user", true},
		{"current wrapper with attrs after source", `<codex_internal_context source="goal" foo="bar">state`, "user", true},
		{"current wrapper with newline before source", "<codex_internal_context\nsource=\"goal\">state", "user", true},
		// Near misses: wrong attribute value, a different attribute name, and
		// an opening tag that is never closed by '>'.
		{"non goal internal context", `<codex_internal_context source="other">state`, "user", false},
		{"data-source attr is not goal", `<codex_internal_context data-source="goal">state`, "user", false},
		{"missing closing tag delimiter", `<codex_internal_context source="goal" state`, "user", false},
		// Other system prefixes are not goal context, and only the "user"
		// role is ever considered.
		{"non goal prefix", "This session is being continued from a previous run", "user", false},
		{"assistant role ignored", "<goal_context>state</goal_context>", "assistant", false},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()
			assert.Equal(t, tc.want, IsGoalContextPrefixed(tc.content, tc.role))
		})
	}
}

// TestSystemPrefixSQL runs the clause produced by SystemPrefixSQL against an
// inline table of candidate rows and checks which labels survive the filter.
// The clause is an exclusion, so the surviving rows are the ones that are
// NOT treated as system messages.
func TestSystemPrefixSQL(t *testing.T) {
	d := testDB(t)
	// The 'newline-before-source' row deliberately contains a literal line
	// break inside its SQL string, so the attribute is preceded by a newline
	// rather than a space.
	rows, err := d.getReader().QueryContext(context.Background(), `
WITH candidates(label, role, content) AS (
VALUES
('normal', 'user', 'regular message'),
('assistant-goal', 'assistant', '<codex_internal_context source="goal">state'),
('legacy-goal', 'user', '<goal_context>state</goal_context>'),
('current-goal', 'user', '<codex_internal_context source="goal">state'),
('attr-before-source', 'user', '<codex_internal_context foo="bar" source="goal">state'),
('attr-after-source', 'user', '<codex_internal_context source="goal" foo="bar">state'),
('newline-before-source', 'user', '<codex_internal_context
source="goal">state'),
('self-closing-goal', 'user', '<codex_internal_context source="goal"/>state'),
('non-goal-internal', 'user', '<codex_internal_context source="other">state'),
('data-source', 'user', '<codex_internal_context data-source="goal">state'),
('missing-close', 'user', '<codex_internal_context source="goal" state')
)
SELECT label FROM candidates
WHERE `+SystemPrefixSQL("content", "role")+`
ORDER BY label`)
	require.NoError(t, err)
	defer rows.Close()

	var got []string
	for rows.Next() {
		var label string
		require.NoError(t, rows.Scan(&label))
		got = append(got, label)
	}
	require.NoError(t, rows.Err())
	// Expected survivors, in label order: the assistant-role row, the
	// near-miss rows, and the ordinary message. Every user-role goal wrapper
	// must have been filtered out.
	assert.Equal(t, []string{
		"assistant-goal",
		"data-source",
		"missing-close",
		"non-goal-internal",
		"normal",
	}, got)
}

func TestSearch(t *testing.T) {
d := testDB(t)
requireFTS(t, d)
Expand Down
2 changes: 1 addition & 1 deletion internal/duckdb/analytics_usage.go
Original file line number Diff line number Diff line change
Expand Up @@ -1954,7 +1954,7 @@ func (s *Store) GetTrendsTerms(
WHERE s.deleted_at IS NULL
AND m.role IN ('user', 'assistant')
AND m.is_system = FALSE
AND `+db.SystemPrefixSQL("m.content", "m.role"))
AND `+db.DuckDBSystemPrefixSQL("m.content", "m.role"))
if err != nil {
return db.TrendsTermsResponse{}, err
}
Expand Down
Loading