Skip to content
Merged
32 changes: 30 additions & 2 deletions services/apps/git_integration/src/crowdgit/database/crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ async def find_github_identity(github_username: str):
WHERE
platform = 'github'
AND value = $1
AND "verified" = TRUE
AND "deletedAt" is null
LIMIT 1
"""
Expand Down Expand Up @@ -405,18 +406,45 @@ async def update_maintainer_run(repo_id: str, maintainer_file: str):


async def get_maintainers_for_repo(repo_id: str):
# Active rows only (endDate IS NULL) — reappearing maintainers hit the "new"
# branch and get reactivated by upsert_maintainer's ON CONFLICT clause.
# verified=TRUE mirrors find_github_identity / find_maintainer_identity_by_email.
# platform/type are returned so the diff's safety guard can match identifiers
# by kind and avoid cross-platform value collisions (e.g. a GitHub username
# "foo" colliding with a same-named handle on another platform).
maintainers_sql_query = """
SELECT mi.role, mi."originalRole", mi."repoUrl", mi."repoId", mi."identityId", mem.value as github_username
SELECT mi.role, mi."originalRole", mi."repoUrl", mi."repoId", mi."identityId",
mem.value as identity_value, mem.platform, mem.type
FROM "maintainersInternal" mi
JOIN "memberIdentities" mem ON mi."identityId" = mem.id
WHERE mi."repoId" = $1 AND mem.platform = 'github' AND mem.type = 'username' and mem.verified = True AND mem."deletedAt" is null
WHERE mi."repoId" = $1
AND mi."endDate" IS NULL
AND mem."verified" = TRUE
AND mem."deletedAt" is null
"""
return await query(
maintainers_sql_query,
(repo_id,),
)


async def get_github_maintainer_usernames_for_repo(repo_id: str) -> set[str]:
"""Return GitHub usernames of active maintainers for fork/parent-repo filtering."""
sql_query = """
SELECT mem.value
FROM "maintainersInternal" mi
JOIN "memberIdentities" mem ON mi."identityId" = mem.id
WHERE mi."repoId" = $1
AND mi."endDate" IS NULL
AND mem.platform = 'github'
AND mem.type = 'username'
AND mem."verified" = TRUE
AND mem."deletedAt" is null
"""
rows = await query(sql_query, (repo_id,))
return {row["value"] for row in rows}


async def set_maintainer_end_date(
repo_id: str, identity_id: str, role: str, change_date: datetime
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from crowdgit.database.crud import (
find_github_identity,
find_maintainer_identity_by_email,
get_github_maintainer_usernames_for_repo,
get_maintainers_for_repo,
save_service_execution,
set_maintainer_end_date,
Expand Down Expand Up @@ -153,43 +154,58 @@ def make_role(self, title: str):
)
return slugify(title)

async def _resolve_identity(
self, github_username: str | None, email: str | None
) -> str | None:
# Fall back to email when github_username is missing/"unknown" — the AI
# extractor emits "unknown" for ~4k entries on the linux MAINTAINERS file.
if github_username and github_username != "unknown":
identity_id = await find_github_identity(github_username)
if identity_id:
return identity_id
if email and email != "unknown":
return await find_maintainer_identity_by_email(email)
return None

async def _resolve_maintainers(
self, maintainers: list[MaintainerInfoItem]
) -> list[tuple[MaintainerInfoItem, str]]:
# Shared by first-run and incremental paths so lookup semantics stay identical.
semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_CHUNKS)

async def resolve(m: MaintainerInfoItem) -> tuple[MaintainerInfoItem, str | None]:
async with semaphore:
identity_id = await self._resolve_identity(m.github_username, m.email)
return m, identity_id

results = await asyncio.gather(*[resolve(m) for m in maintainers])

resolved: list[tuple[MaintainerInfoItem, str]] = []
for m, identity_id in results:
if identity_id is None:
self.logger.warning(f"Identity not found for maintainer: {m}")
continue
Comment thread
joanagmaia marked this conversation as resolved.
resolved.append((m, identity_id))
return resolved

async def insert_new_maintainers(
self, repo_url: str, repo_id: str, maintainers: list[MaintainerInfoItem]
):
async def process_maintainer(maintainer: MaintainerInfoItem):
self.logger.info(f"Processing maintainer: {maintainer.github_username}")
role = maintainer.normalized_title
original_role = self.make_role(maintainer.title)
# Find the identity in the database
github_username = maintainer.github_username
email = maintainer.email

if github_username == "unknown" and email == "unknown":
self.logger.warning("username & email with value 'unknown' aborting")
return
identity_id = (
await find_github_identity(github_username)
if github_username != "unknown"
else await find_maintainer_identity_by_email(email)
)
self.logger.debug(
f"Found identity_id for {github_username}: {identity_id} (type: {type(identity_id)})"
)
if identity_id:
resolved = await self._resolve_maintainers(maintainers)
# Concurrent upserts: large MAINTAINERS files carry thousands of entries.
semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_CHUNKS)

async def upsert(maintainer: MaintainerInfoItem, identity_id: str) -> None:
async with semaphore:
role = maintainer.normalized_title
original_role = self.make_role(maintainer.title)
await upsert_maintainer(repo_id, identity_id, repo_url, role, original_role)
self.logger.info(
f"Successfully upserted maintainer {github_username} with identity_id {identity_id}"
f"Successfully upserted maintainer {maintainer.github_username} "
f"with identity_id {identity_id}"
)
else:
self.logger.warning(f"Identity not found for GitHub user: {maintainer}")

semaphore = asyncio.Semaphore(3)

async def process_with_semaphore(maintainer: MaintainerInfoItem):
async with semaphore:
await process_maintainer(maintainer)

await asyncio.gather(*[process_with_semaphore(maintainer) for maintainer in maintainers])
await asyncio.gather(*[upsert(m, identity_id) for m, identity_id in resolved])

async def compare_and_update_maintainers(
self,
Expand All @@ -200,63 +216,73 @@ async def compare_and_update_maintainers(
):
self.logger.info(f"Comparing and updating maintainers for repo: {repo_id}")
current_maintainers = await get_maintainers_for_repo(repo_id)
current_maintainers_dict = {m["github_username"]: m for m in current_maintainers}
new_maintainers_dict = {m.github_username: m for m in maintainers}

for github_username, maintainer in new_maintainers_dict.items():
role = maintainer.normalized_title
# Key by (identityId, role) — keying by github_username collapsed every
# "unknown" extraction into one slot, silently dropping most email-only
# maintainers (~4k of 4216 entries on the linux MAINTAINERS file).
current_by_key: dict[tuple[str, str], dict] = {
(m["identityId"], m["role"]): m for m in current_maintainers
}

# Resolve before keying so the comparison is identity-based: the same
# person may extract with different github_username values across runs.
resolved = await self._resolve_maintainers(maintainers)
new_by_key: dict[tuple[str, str], MaintainerInfoItem] = {
(identity_id, m.normalized_title): m for m, identity_id in resolved
}

for (identity_id, role), maintainer in new_by_key.items():
if (identity_id, role) in current_by_key:
continue
original_role = self.make_role(maintainer.title)
if github_username == "unknown" and maintainer.email in ("unknown", None):
self.logger.warning(
f"Skipping unknown github_username & email with title {maintainer.title}"
)
await upsert_maintainer(
repo_id, identity_id, repo_url, role, original_role, start_date=change_date
)
self.logger.info(
f"Inserted new maintainer {maintainer.github_username} "
f"with identity_id {identity_id} role {role}"
)

# Safety guard scoped to entries whose identity resolution FAILED this run.
# A maintainer who resolves but ends up under a different (identityId, role)
# — i.e. a role change — must still be end-dated on the old role row, so we
# only protect values from extractor entries that did not resolve. Matching
# is kind-aware so a GitHub username "foo" cannot collide with a same-named
# handle on another platform (different person).
resolved_ids = {id(m) for m, _ in resolved}
unresolved_usernames: set[str] = set()
unresolved_emails: set[str] = set()
for m in maintainers:
if id(m) in resolved_ids:
continue
elif github_username not in current_maintainers_dict:
# New maintainer
identity_id = (
await find_github_identity(github_username)
if github_username != "unknown"
else await find_maintainer_identity_by_email(maintainer.email)
)
self.logger.info(f"Found new maintainer {github_username} to be inserted")
if identity_id:
await upsert_maintainer(
repo_id, identity_id, repo_url, role, original_role, start_date=change_date
)
self.logger.info(
f"Successfully inserted new maintainer {github_username} with identity_id {identity_id}"
)
else:
# will happen for new users if their identity isn't created yet but should be fixed on the next iteration
self.logger.warning(f"Identity not found for username: {github_username}")
else:
# Existing maintainer
current_maintainer = current_maintainers_dict[github_username]
if current_maintainer["role"] != role:
# Role has changed: we update maintainer
self.logger.info(
f"Role changed from {current_maintainer['role']} to {role} for maintainer {current_maintainer['identityId']}"
)
await upsert_maintainer(
repo_id,
current_maintainer["identityId"],
repo_url,
role,
original_role,
change_date,
)
if m.github_username and m.github_username != "unknown":
unresolved_usernames.add(m.github_username.lower())
if m.email and m.email != "unknown":
unresolved_emails.add(m.email.lower())

for github_username, current_maintainer in current_maintainers_dict.items():
if github_username not in new_maintainers_dict:
self.logger.info(
f"Maintainer {github_username} with identity {current_maintainer['identityId']} no longer exists, updating its endDate..."
)
await set_maintainer_end_date(
repo_id,
current_maintainer["identityId"],
current_maintainer["role"],
change_date,
for (identity_id, role), current in current_by_key.items():
if (identity_id, role) in new_by_key:
continue
current_value = (current.get("identity_value") or "").lower()
current_platform = current.get("platform")
current_type = current.get("type")
is_github_username = current_platform == "github" and current_type == "username"
is_email = current_type == "email"
skip_end_date = bool(current_value) and (
(is_github_username and current_value in unresolved_usernames)
or (is_email and current_value in unresolved_emails)
)
Comment thread
joanagmaia marked this conversation as resolved.
if skip_end_date:
self.logger.warning(
f"Maintainer with identity {identity_id} role {role} could not be "
f"re-resolved but is still mentioned in the source; skipping end-date"
)
continue
self.logger.info(
f"Maintainer with identity {identity_id} role {role} no longer exists, "
f"updating its endDate..."
)
await set_maintainer_end_date(repo_id, identity_id, role, change_date)

async def save_maintainers(
self,
Expand Down Expand Up @@ -941,13 +967,16 @@ async def exclude_parent_repo_maintainers(
if not parent_repo or not extracted_maintainers:
return extracted_maintainers

parent_repo_maintainers = await get_maintainers_for_repo(parent_repo.id)
if not parent_repo_maintainers:
self.logger.info(f"No maintainers found for parent repo {parent_repo.url}")
# Dedicated github-username lookup: get_maintainers_for_repo now returns any
# identity type (email-linked rows included), but this filter compares against
# extracted github_username values, so we must narrow to platform='github'/type='username'.
parent_github_usernames = await get_github_maintainer_usernames_for_repo(parent_repo.id)
if not parent_github_usernames:
self.logger.info(
f"No github-username maintainers found for parent repo {parent_repo.url}"
)
Comment thread
joanagmaia marked this conversation as resolved.
return extracted_maintainers

parent_github_usernames = {m["github_username"] for m in parent_repo_maintainers}

fork_only_maintainers = [
maintainer
for maintainer in extracted_maintainers
Expand Down
Loading