From 4955ca7560ffee3e4f3e31897f2515072fb1d110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Barnab=C3=A1s=20Domozi?= Date: Mon, 8 Jun 2026 15:53:50 +0200 Subject: [PATCH] Share checker configurations across analysis runs via CheckerSet Introduce CheckerSet and CheckerSetItem tables to deduplicate checker configurations across analysis runs. Instead of storing enabled/disabled checkers per AnalysisInfo row, checkers are now grouped into reusable sets identified by a SHA-256 hash. This avoids redundant rows when multiple runs share the same checker configuration. --- .../codechecker_server/api/mass_store_run.py | 98 ++++--- .../codechecker_server/api/report_server.py | 22 +- .../database/run_db_model.py | 68 +++-- ...17_add_checkerset_checkersetitem_tables.py | 247 ++++++++++++++++++ 4 files changed, 373 insertions(+), 62 deletions(-) create mode 100644 web/server/codechecker_server/migrations/report/versions/cdfb6397dd17_add_checkerset_checkersetitem_tables.py diff --git a/web/server/codechecker_server/api/mass_store_run.py b/web/server/codechecker_server/api/mass_store_run.py index 64a319991d..a04ef63834 100644 --- a/web/server/codechecker_server/api/mass_store_run.py +++ b/web/server/codechecker_server/api/mass_store_run.py @@ -46,9 +46,9 @@ from ..database.config_db_model import Product from ..database.database import DBSession from ..database.run_db_model import \ - AnalysisInfo, AnalysisInfoChecker, AnalyzerStatistic, \ + AnalysisInfo, AnalyzerStatistic, \ BugPathEvent, BugReportPoint, \ - Checker, \ + Checker, CheckerSet, CheckerSetItem, \ ExtendedReportData, \ File, FileContent, \ Report as DBReport, ReportAnnotations, ReviewStatus as ReviewStatusRule, \ @@ -63,6 +63,8 @@ from ..task_executors.task_manager import TaskManager from .thrift_enum_helper import report_extended_data_type_str +from sqlalchemy.orm import Session as SA_Session + LOG = get_logger('server') @@ -1002,7 +1004,7 @@ def __store_analysis_statistics( def __store_analysis_info( self, - session: 
DBSession, + session: SA_Session, run_history: RunHistory ): """ Store analysis info for the given run history. """ @@ -1012,38 +1014,64 @@ def __store_analysis_info( analyzer_command.encode("utf-8"), zlib.Z_BEST_COMPRESSION) - analysis_info_rows = session \ - .query(AnalysisInfo) \ - .filter(AnalysisInfo.analyzer_command == cmd) \ - .all() - - if analysis_info_rows: - # It is possible when multiple runs are stored - # simultaneously to the server with the same analysis - # command that multiple entries are stored into the - # database. In this case we will select the first one. - analysis_info = analysis_info_rows[0] - else: - analysis_info = AnalysisInfo(analyzer_command=cmd) - - # Obtain the ID eagerly to be able to use the M-to-N table. - session.add(analysis_info) - session.flush() - session.refresh(analysis_info, ["id"]) - - for analyzer in mip.analyzers: - q = session \ - .query(Checker) \ - .filter(Checker.analyzer_name == analyzer) - db_checkers = {r.checker_name: r for r in q.all()} - - connection_rows = [AnalysisInfoChecker( - analysis_info, db_checkers[chk], is_enabled) - for chk, is_enabled - in mip.checkers.get(analyzer, {}).items()] - for r in connection_rows: - session.add(r) - + enabled_checkers: List[int] = [] + disabled_checkers: List[int] = [] + for analyzer in mip.analyzers: + q = session \ + .query(Checker) \ + .filter(Checker.analyzer_name == analyzer) + db_checkers = {r.checker_name: r for r in q.all()} + + for chk, is_enabled in \ + mip.checkers.get(analyzer, {}).items(): + if is_enabled: + enabled_checkers.append(db_checkers[chk].id) + else: + disabled_checkers.append(db_checkers[chk].id) + + # Check if the CheckerSet was already used before. 
+ checker_set_hash = CheckerSet.compute_hash(enabled_checkers, + disabled_checkers) + checker_set = session.query(CheckerSet) \ + .filter(CheckerSet.hash_digest == checker_set_hash) \ + .first() + + # If not, create a new CheckerSet and insert to db + if not checker_set: + try: + with session.begin_nested(): + checker_set = CheckerSet(checker_set_hash) + # Obtain CheckerSet id eagerly. + session.add(checker_set) + session.flush() + session.refresh(checker_set, ["id"]) + LOG.info( + "[%s] Created new CheckerSet with hash '%s'", + self._name, checker_set_hash) + + # Insert checkers as elements of this newly + # created CheckerSet + for e in enabled_checkers: + session.add(CheckerSetItem(checker_set.id, + e, True)) + for d in disabled_checkers: + session.add(CheckerSetItem(checker_set.id, + d, False)) + except Exception: + # Meanwhile, another store operation already + # inserted this CheckerSet, query the existing + # one from db + checker_set = session.query(CheckerSet) \ + .filter( + CheckerSet.hash_digest == checker_set_hash) \ + .first() + + if not checker_set: + raise RuntimeError( + "Failed to query CheckerSet from database!") + + analysis_info = AnalysisInfo(analyzer_command=cmd, + checker_set_id=checker_set.id) run_history.analysis_info.append(analysis_info) self.__analysis_info[src_dir_path] = analysis_info diff --git a/web/server/codechecker_server/api/report_server.py b/web/server/codechecker_server/api/report_server.py index a6d126d1a1..196824a3ce 100644 --- a/web/server/codechecker_server/api/report_server.py +++ b/web/server/codechecker_server/api/report_server.py @@ -64,10 +64,10 @@ from ..database.config_db_model import Product from ..database.database import conv, DBSession, escape_like from ..database.run_db_model import \ - AnalysisInfo, AnalysisInfoChecker as DB_AnalysisInfoChecker, \ + AnalysisInfo, \ AnalyzerStatistic, \ BugPathEvent, BugReportPoint, \ - CleanupPlan, CleanupPlanReportHash, Checker, Comment, \ + CleanupPlan, 
CleanupPlanReportHash, Checker, CheckerSetItem, Comment, \ ExtendedReportData, \ File, FileContent, \ Report, ReportAnnotations, ReportAnalysisInfo, ReviewStatus, \ @@ -1949,11 +1949,11 @@ def getAnalysisInfo(self, analysis_info_filter, limit, offset): checkers_q = session \ .query(Checker.analyzer_name, Checker.checker_name, - DB_AnalysisInfoChecker.enabled) \ - .join(Checker, DB_AnalysisInfoChecker.checker_id == + CheckerSetItem.enabled) \ + .join(Checker, CheckerSetItem.checker_id == Checker.id) \ - .filter(DB_AnalysisInfoChecker. - analysis_info_id == cmd.id) + .filter(CheckerSetItem.checker_set_id == + cmd.checker_set_id) checkers: Dict[str, Dict[str, API_AnalysisInfoChecker]] = \ defaultdict(dict) @@ -3383,12 +3383,12 @@ def getCheckerStatusVerificationDetails(self, run_ids, report_filter): ) .join(RunHistory) .join(AnalysisInfo, RunHistory.analysis_info) - .join(DB_AnalysisInfoChecker, ( - (AnalysisInfo.id == - DB_AnalysisInfoChecker.analysis_info_id) - & (DB_AnalysisInfoChecker.enabled.is_(True)))) + .join(CheckerSetItem, ( + (AnalysisInfo.checker_set_id == + CheckerSetItem.checker_set_id) + & (CheckerSetItem.enabled.is_(True)))) .join(Checker, - DB_AnalysisInfoChecker.checker_id == Checker.id) + CheckerSetItem.checker_id == Checker.id) .outerjoin(Report, ((Checker.id == Report.checker_id) & (Run.id == Report.run_id))) .filter(RunHistory.id == max_run_histories.subquery() diff --git a/web/server/codechecker_server/database/run_db_model.py b/web/server/codechecker_server/database/run_db_model.py index f240ae7826..9f27823a92 100644 --- a/web/server/codechecker_server/database/run_db_model.py +++ b/web/server/codechecker_server/database/run_db_model.py @@ -11,7 +11,9 @@ from datetime import datetime, timedelta from math import ceil import os -from typing import Optional +import json +import hashlib +from typing import Optional, List from sqlalchemy import Boolean, Column, DateTime, Enum, ForeignKey, Integer, \ LargeBinary, MetaData, String, UniqueConstraint, 
Table, Text, JSON @@ -53,29 +55,58 @@ def __init__(self, analyzer_name: str, checker_name: str, severity: int): self.severity = severity -class AnalysisInfoChecker(Base): - __tablename__ = "analysis_info_checkers" +class CheckerSet(Base): + __tablename__ = "checker_set" - analysis_info_id = Column(Integer, - ForeignKey("analysis_info.id", - deferrable=True, - initially="DEFERRED", - ondelete="CASCADE"), - primary_key=True) + id = Column(Integer, autoincrement=True, primary_key=True) + hash_digest = Column(String, unique=True, nullable=False) + + def __init__(self, hash_digest: str): + self.hash_digest = hash_digest + + # We compute a hash from the enabled_checkers and + # disabled_checkers lists to generate a unique identifier + # for each CheckerSet. + # The goal is to speed up report storage to the server: + # when a user stores results to the server, we first compute + # this hash and then check if it was already inserted to the database. + # If the hash exists in this table, that means the particular CheckerSet + # was already used before. + @staticmethod + def compute_hash(enabled_checkers: List[int], + disabled_checkers: List[int]) -> str: + # Sort lists to create identical hashes. 
+ # Work on sorted copies so the caller's lists are not mutated. + enabled_sorted = sorted(enabled_checkers) + disabled_sorted = sorted(disabled_checkers) + checker_set_dict = {"e": enabled_sorted, "d": disabled_sorted} + return hashlib.sha256( + json.dumps(checker_set_dict).encode()).hexdigest() + + +class CheckerSetItem(Base): + __tablename__ = "checker_set_items" + + checker_set_id = Column(Integer, + ForeignKey("checker_set.id", + deferrable=True, + initially="DEFERRED", + ondelete="CASCADE"), + primary_key=True) checker_id = Column(Integer, ForeignKey("checkers.id", deferrable=True, initially="DEFERRED", ondelete="RESTRICT"), primary_key=True) - enabled = Column(Boolean) + enabled = Column(Boolean, nullable=False) def __init__(self, - analysis_info: "AnalysisInfo", - checker: Checker, + checker_set_id: int, + checker_id: int, is_enabled: bool): - self.analysis_info_id = analysis_info.id - self.checker_id = checker.id + self.checker_set_id = checker_set_id + self.checker_id = checker_id self.enabled = is_enabled @@ -84,10 +115,15 @@ class AnalysisInfo(Base): id = Column(Integer, autoincrement=True, primary_key=True) analyzer_command = Column(LargeBinary) - available_checkers = relationship(AnalysisInfoChecker, uselist=True) + checker_set_id = Column(Integer, + ForeignKey("checker_set.id", + deferrable=True, + initially="DEFERRED", + ondelete="CASCADE")) - def __init__(self, analyzer_command: bytes): + def __init__(self, analyzer_command: bytes, checker_set_id: int): self.analyzer_command = analyzer_command + self.checker_set_id = checker_set_id class Run(Base): diff --git a/web/server/codechecker_server/migrations/report/versions/cdfb6397dd17_add_checkerset_checkersetitem_tables.py b/web/server/codechecker_server/migrations/report/versions/cdfb6397dd17_add_checkerset_checkersetitem_tables.py new file mode 100644 index 0000000000..0a9b5d8311 --- /dev/null +++ b/web/server/codechecker_server/migrations/report/versions/cdfb6397dd17_add_checkerset_checkersetitem_tables.py @@ -0,0 +1,247 @@ +""" +Add CheckerSet, CheckerSetItem tables + +Revision ID: 
cdfb6397dd17 +Revises: 24c9660f82b1 +Create Date: 2026-06-08 16:56:50.458098 +""" + +from logging import getLogger + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +import json + +from codechecker_server.database.run_db_model import CheckerSet + +# Revision identifiers, used by Alembic. +revision = 'cdfb6397dd17' +down_revision = '24c9660f82b1' +branch_labels = None +depends_on = None + + +def upgrade(): + LOG = getLogger("migration/report") + dialect = op.get_context().dialect.name + conn = op.get_bind() + + op.create_table( + 'checker_set', + sa.Column('id', sa.Integer(), + autoincrement=True, nullable=False), + sa.Column('hash_digest', sa.String(), nullable=False), + sa.PrimaryKeyConstraint('id', name=op.f('pk_checker_set')), + sa.UniqueConstraint('hash_digest', + name=op.f('uq_checker_set_hash_digest')) + ) + op.create_table( + 'checker_set_items', + sa.Column('checker_set_id', sa.Integer(), + nullable=False), + sa.Column('checker_id', sa.Integer(), nullable=False), + sa.Column('enabled', sa.Boolean(), nullable=False), + sa.ForeignKeyConstraint( + ['checker_id'], ['checkers.id'], + name=op.f('fk_checker_set_items_checker_id_checkers'), + ondelete='RESTRICT', initially='DEFERRED', + deferrable=True), + sa.ForeignKeyConstraint( + ['checker_set_id'], ['checker_set.id'], + name=op.f('fk_checker_set_items_checker_set_id_checker_set'), + ondelete='CASCADE', initially='DEFERRED', deferrable=True), + sa.PrimaryKeyConstraint('checker_set_id', 'checker_id', + name=op.f('pk_checker_set_items')) + ) + op.add_column('analysis_info', sa.Column('checker_set_id', + sa.Integer(), nullable=True)) + + if dialect == "postgresql": + op.create_foreign_key(op.f( + 'fk_analysis_info_checker_set_id_checker_set'), + 'analysis_info', 'checker_set', ['checker_set_id'], + ['id'], ondelete='CASCADE', initially='DEFERRED', deferrable=True) + elif dialect == "sqlite": + op.execute("PRAGMA foreign_keys=OFF") + with 
op.batch_alter_table('analysis_info', schema=None) as batch_op: + batch_op.create_foreign_key( + batch_op.f('fk_analysis_info_checker_set_id_checker_set'), + 'checker_set', + ['checker_set_id'], ['id'], + ondelete='CASCADE', initially='DEFERRED', deferrable=True + ) + else: + raise Exception(f"Dialect {dialect} is not supported!") + + LOG.info("Aggregating all checkers from table " + f"analysis_info_checkers (dialect: {dialect}) ...") + checker_sets = {} + + if dialect == "postgresql": + # Note: CheckerSet.compute_hash() uses SHA256, which is significantly + # slower than MD5, so PostgreSQL's built-in md5() is used here only as + # a cheap key to put identical checker lists into groups. + # array_agg() yields NULL when no row passes its FILTER, and + # NULL || text is NULL: without COALESCE every analysis_info that has + # no enabled or no disabled checkers would share a single NULL key. + # The derived table is aliased; PostgreSQL before 16 requires that. + query = \ + conn.execute( + sa.text( + """SELECT analysis_info_id, + enabled_checkers, + disabled_checkers, + md5(coalesce(enabled_checkers::text, '') || '-' + || coalesce(disabled_checkers::text, '')) AS hash + FROM + (SELECT analysis_info_id, + array_agg(checker_id ORDER by checker_id) + FILTER (WHERE enabled) AS enabled_checkers, + array_agg(checker_id ORDER by checker_id) + FILTER (WHERE NOT enabled) AS disabled_checkers + FROM analysis_info_checkers + GROUP BY analysis_info_id) AS agg""")) + for analysis_info_id, enabled_checkers, \ + disabled_checkers, md5sum in query: + enabled_checkers = enabled_checkers or [] + disabled_checkers = disabled_checkers or [] + + if md5sum not in checker_sets: + checker_sets[md5sum] = {} + checker_sets[md5sum]["hash_digest"] = \ + CheckerSet.compute_hash(enabled_checkers, + disabled_checkers) + checker_sets[md5sum]["enabled_checkers"] = enabled_checkers + checker_sets[md5sum]["disabled_checkers"] = disabled_checkers + checker_sets[md5sum]["analysis_info_ids"] = [analysis_info_id] + else: + checker_sets[md5sum]["analysis_info_ids"]. 
\ + append(analysis_info_id) + elif dialect == "sqlite": + # Note: SQLite does not support an md5 hash function, + # therefore we need to compute the hash every time + # in Python to determine the checker groups. + # + # Ordering in json_group_array is not needed in this case, + # since no md5sum is computed. + # Additionally, CheckerSet.compute_hash() always sorts checker + # lists for hash generation. + query = conn.execute( + sa.text("""SELECT analysis_info_id, + json_group_array(checker_id) + FILTER (WHERE enabled) AS enabled_checkers, + json_group_array(checker_id) + FILTER (WHERE NOT enabled) AS disabled_checkers + FROM analysis_info_checkers + GROUP BY analysis_info_id""")) + + for analysis_info_id, enabled_checkers, disabled_checkers in query: + enabled_checkers = json.loads(enabled_checkers) \ + if enabled_checkers != "[null]" else [] + disabled_checkers = json.loads(disabled_checkers) \ + if disabled_checkers != "[null]" else [] + hash_digest = CheckerSet.compute_hash(enabled_checkers, + disabled_checkers) + + if hash_digest not in checker_sets: + checker_sets[hash_digest] = {} + checker_sets[hash_digest]["hash_digest"] = \ + hash_digest + checker_sets[hash_digest]["enabled_checkers"] = \ + enabled_checkers + checker_sets[hash_digest]["disabled_checkers"] = \ + disabled_checkers + checker_sets[hash_digest]["analysis_info_ids"] = \ + [analysis_info_id] + else: + checker_sets[hash_digest]["analysis_info_ids"]. 
\ + append(analysis_info_id) + else: + raise Exception(f"Dialect {dialect} is not supported!") + + LOG.info("Inserting new CheckerSets to database ...") + for v in checker_sets.values(): + conn.execute( + sa.text("INSERT INTO checker_set (hash_digest) " + "VALUES (:hash_digest)"), + {"hash_digest": v["hash_digest"]} + ) + + # Obtain CheckerSet ID + select_q = conn.execute( + sa.text("SELECT id from checker_set " + "WHERE hash_digest = :hash_digest"), + {"hash_digest": v["hash_digest"]} + ).fetchone() + + if not select_q: + raise Exception("Failed to insert checker_set " + f"with hash_digest {v['hash_digest']}") + + # Bulk insert checkers + if v["enabled_checkers"]: + conn.execute( + sa.text("INSERT INTO checker_set_items " + "(checker_set_id, checker_id, enabled) " + "VALUES (:checker_set_id, :checker_id, :enabled)"), + [{"checker_set_id": select_q.id, + "checker_id": checker_id, "enabled": True} + for checker_id in v["enabled_checkers"]] + ) + + if v["disabled_checkers"]: + conn.execute( + sa.text("INSERT INTO checker_set_items " + "(checker_set_id, checker_id, enabled) " + "VALUES (:checker_set_id, :checker_id, :enabled)"), + [{"checker_set_id": select_q.id, + "checker_id": checker_id, "enabled": False} + for checker_id in v["disabled_checkers"]] + ) + + # Update AnalysisInfo table checker_set_id column + conn.execute( + sa.text( + "UPDATE analysis_info SET checker_set_id = :checker_set_id " + "WHERE id IN :ids").bindparams(sa.bindparam( + "ids", expanding=True)), + [{"checker_set_id": select_q.id, "ids": v["analysis_info_ids"]}] + ) + + LOG.info("Dropping table analysis_info_checkers ...") + op.drop_table('analysis_info_checkers') + # ### end Alembic commands ### + + +def downgrade(): + LOG = getLogger("migration/report") + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_constraint( + op.f('fk_analysis_info_checker_set_id_checker_set'), + 'analysis_info', type_='foreignkey') + op.drop_column('analysis_info', 'checker_set_id') + op.create_table( + 'analysis_info_checkers', + sa.Column('analysis_info_id', sa.INTEGER(), autoincrement=False, + nullable=False), + sa.Column('checker_id', sa.INTEGER(), autoincrement=False, + nullable=False), + sa.Column('enabled', sa.BOOLEAN(), autoincrement=False, + nullable=True), + sa.ForeignKeyConstraint( + ['analysis_info_id'], ['analysis_info.id'], + name=op.f( + 'fk_analysis_info_checkers_analysis_info_id_analysis_info'), + ondelete='CASCADE', initially='DEFERRED', deferrable=True), + sa.ForeignKeyConstraint( + ['checker_id'], ['checkers.id'], + name=op.f( + 'fk_analysis_info_checkers_checker_id_checkers'), + ondelete='RESTRICT', initially='DEFERRED', deferrable=True), + sa.PrimaryKeyConstraint('analysis_info_id', 'checker_id', + name=op.f('pk_analysis_info_checkers')) + ) + op.drop_table('checker_set_items') + op.drop_table('checker_set') + # ### end Alembic commands ###