From 0f1147036fc9368dfc505208712036e0bbb802cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C5=A1per=20Grom?= Date: Thu, 7 May 2026 13:01:30 +0100 Subject: [PATCH 1/7] feat: detect and store repo license via licensee IN-1105 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Gašper Grom --- .../U1778154987__addLicenseToRepositories.sql | 1 + .../V1778154987__addLicenseToRepositories.sql | 1 + .../docker/Dockerfile.git_integration | 9 +++-- .../src/crowdgit/database/crud.py | 11 +++++ .../git_integration/src/crowdgit/server.py | 3 ++ .../src/crowdgit/services/__init__.py | 2 + .../src/crowdgit/services/license/__init__.py | 3 ++ .../services/license/license_service.py | 40 +++++++++++++++++++ .../src/crowdgit/worker/repository_worker.py | 6 +++ 9 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 backend/src/database/migrations/U1778154987__addLicenseToRepositories.sql create mode 100644 backend/src/database/migrations/V1778154987__addLicenseToRepositories.sql create mode 100644 services/apps/git_integration/src/crowdgit/services/license/__init__.py create mode 100644 services/apps/git_integration/src/crowdgit/services/license/license_service.py diff --git a/backend/src/database/migrations/U1778154987__addLicenseToRepositories.sql b/backend/src/database/migrations/U1778154987__addLicenseToRepositories.sql new file mode 100644 index 0000000000..32d205e2a7 --- /dev/null +++ b/backend/src/database/migrations/U1778154987__addLicenseToRepositories.sql @@ -0,0 +1 @@ +ALTER TABLE public.repositories DROP COLUMN license; diff --git a/backend/src/database/migrations/V1778154987__addLicenseToRepositories.sql b/backend/src/database/migrations/V1778154987__addLicenseToRepositories.sql new file mode 100644 index 0000000000..a0285f5fea --- /dev/null +++ b/backend/src/database/migrations/V1778154987__addLicenseToRepositories.sql @@ -0,0 +1 @@ +ALTER TABLE public.repositories ADD COLUMN license VARCHAR(255); diff --git a/scripts/services/docker/Dockerfile.git_integration b/scripts/services/docker/Dockerfile.git_integration index 4c9c371007..dd48420395 100644 --- a/scripts/services/docker/Dockerfile.git_integration +++ b/scripts/services/docker/Dockerfile.git_integration @@ -83,10 +83,13 @@ RUN apt-get update && apt-get install -y \ ca-certificates \ git \ ripgrep \ + ruby \ + ruby-dev \ + build-essential \ --no-install-recommends \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get clean \ - && apt-get autoremove -y + && gem install licensee -v '10.0.0' --no-document \ + && apt-get remove --autoremove -y ruby-dev build-essential \ + && rm -rf /var/lib/apt/lists/* ENV PYTHONUNBUFFERED=1 \ PYTHONDONTWRITEBYTECODE=1 \ diff --git a/services/apps/git_integration/src/crowdgit/database/crud.py b/services/apps/git_integration/src/crowdgit/database/crud.py index 0c48c0d5ac..e42649d910 100644 --- a/services/apps/git_integration/src/crowdgit/database/crud.py +++ b/services/apps/git_integration/src/crowdgit/database/crud.py @@ -283,6 +283,17 @@ async def update_last_processed_commit(repo_id: str, commit_hash: str, branch: s return str(result) +async def update_repository_license(repository_id: str, license_spdx: str | None) -> None: + sql_query = """ + UPDATE public.repositories + SET license = $1, + "updatedAt" = NOW() + WHERE id = $2 + AND ($1 IS NOT NULL OR license IS NULL) + """ + await execute(sql_query, (license_spdx, repository_id)) + + async def mark_repo_as_processed(repo_id: str, repo_state: RepositoryState): sql_query = """ UPDATE git."repositoryProcessing" diff --git a/services/apps/git_integration/src/crowdgit/server.py b/services/apps/git_integration/src/crowdgit/server.py index 7483680592..9aee058fd3 100644 --- a/services/apps/git_integration/src/crowdgit/server.py +++ b/services/apps/git_integration/src/crowdgit/server.py @@ -8,6 +8,7 @@ from crowdgit.services import ( CloneService, CommitService, + LicenseService, MaintainerService, QueueService, SoftwareValueService, @@ -28,6 +29,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]: software_value_service = SoftwareValueService() vulnerability_scanner_service = VulnerabilityScannerService() maintainer_service = MaintainerService() + license_service = LicenseService() worker_task = None worker = RepositoryWorker( @@ -36,6 +38,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]: software_value_service=software_value_service, vulnerability_scanner_service=vulnerability_scanner_service, maintainer_service=maintainer_service, + license_service=license_service, queue_service=queue_service, ) logger.info("Repo worker initialized") diff --git a/services/apps/git_integration/src/crowdgit/services/__init__.py b/services/apps/git_integration/src/crowdgit/services/__init__.py index bb0c4eca39..6f7c2d5051 100644 --- a/services/apps/git_integration/src/crowdgit/services/__init__.py +++ b/services/apps/git_integration/src/crowdgit/services/__init__.py @@ -1,6 +1,7 @@ from crowdgit.services.base.base_service import BaseService from crowdgit.services.clone.clone_service import CloneService from crowdgit.services.commit.commit_service import CommitService +from crowdgit.services.license.license_service import LicenseService from crowdgit.services.maintainer.maintainer_service import MaintainerService from crowdgit.services.queue.queue_service import QueueService from crowdgit.services.software_value.software_value_service import SoftwareValueService @@ -12,6 +13,7 @@ "BaseService", "CloneService", "CommitService", + "LicenseService", "SoftwareValueService", "VulnerabilityScannerService", "MaintainerService", diff --git a/services/apps/git_integration/src/crowdgit/services/license/__init__.py b/services/apps/git_integration/src/crowdgit/services/license/__init__.py new file mode 100644 index 0000000000..be33c0ace6 --- /dev/null +++ b/services/apps/git_integration/src/crowdgit/services/license/__init__.py @@ -0,0 +1,3 @@ +from crowdgit.services.license.license_service import LicenseService + +__all__ = ["LicenseService"] diff --git a/services/apps/git_integration/src/crowdgit/services/license/license_service.py b/services/apps/git_integration/src/crowdgit/services/license/license_service.py new file mode 100644 index 0000000000..5ba3f22f4f --- /dev/null +++ b/services/apps/git_integration/src/crowdgit/services/license/license_service.py @@ -0,0 +1,40 @@ +import json + +from crowdgit.errors import CommandExecutionError, CommandTimeoutError +from crowdgit.services.base.base_service import BaseService +from crowdgit.services.utils import run_shell_command + + +class LicenseService(BaseService): + """Detects SPDX license from a cloned repository using the licensee gem.""" + + async def detect(self, repo_path: str) -> str | None: + """Run licensee against repo_path and return the SPDX identifier, or None.""" + try: + output = await run_shell_command(["licensee", "detect", "--json", repo_path]) + except CommandExecutionError: + self.logger.info(f"licensee found no license in {repo_path}") + return None + except CommandTimeoutError as e: + self.logger.warning(f"licensee timed out: {repr(e)}") + return None + except FileNotFoundError as e: + self.logger.warning(f"licensee binary not found in PATH: {repr(e)}") + return None + except Exception as e: + self.logger.warning(f"licensee failed: {repr(e)}") + return None + + try: + data = json.loads(output) + matched = data.get("matched_license") or {} + spdx_id = matched.get("spdx_id") + confidence = matched.get("confidence") + if spdx_id: + self.logger.info(f"License detected: {spdx_id} (confidence={confidence}) in {repo_path}") + else: + self.logger.info(f"No SPDX license matched in {repo_path}") + return spdx_id + except Exception as e: + self.logger.warning(f"Failed to parse licensee output: {repr(e)}") + return None diff --git a/services/apps/git_integration/src/crowdgit/worker/repository_worker.py b/services/apps/git_integration/src/crowdgit/worker/repository_worker.py index d949c58f84..fdf598de98 100644 --- a/services/apps/git_integration/src/crowdgit/worker/repository_worker.py +++ b/services/apps/git_integration/src/crowdgit/worker/repository_worker.py @@ -7,6 +7,7 @@ mark_repo_as_processed, release_repo, update_last_processed_commit, + update_repository_license, ) from crowdgit.enums import RepositoryState from crowdgit.errors import ( @@ -22,6 +23,7 @@ from crowdgit.services import ( CloneService, CommitService, + LicenseService, MaintainerService, QueueService, SoftwareValueService, @@ -46,6 +48,7 @@ def __init__( software_value_service: SoftwareValueService, vulnerability_scanner_service: VulnerabilityScannerService, maintainer_service: MaintainerService, + license_service: LicenseService, queue_service: QueueService, ): self.clone_service = clone_service @@ -53,6 +56,7 @@ def __init__( self.software_value_service = software_value_service self.vulnerability_scanner_service = vulnerability_scanner_service self.maintainer_service = maintainer_service + self.license_service = license_service self.queue_service = queue_service self._shutdown = False @@ -236,6 +240,8 @@ async def _process_single_repository(self, repository: Repository): repository.id, batch_info.repo_path, repository.url ) await self.maintainer_service.process_maintainers(repository, batch_info) + license_spdx = await self.license_service.detect(batch_info.repo_path) + await update_repository_license(repository.id, license_spdx) await self.commit_service.process_single_batch_commits( repository, batch_info, From 19bd389764b8b50327ee824ca89aac01080df41f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C5=A1per=20Grom?= Date: Fri, 8 May 2026 10:07:39 +0100 Subject: [PATCH 2/7] fix: correct licensee version, JSON parsing, and null type cast IN-1105 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Gašper Grom --- scripts/services/docker/Dockerfile.git_integration | 8 ++++++-- .../apps/git_integration/src/crowdgit/database/crud.py | 4 ++-- .../src/crowdgit/services/license/license_service.py | 7 ++++--- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/scripts/services/docker/Dockerfile.git_integration b/scripts/services/docker/Dockerfile.git_integration index dd48420395..3fd1c89a1b 100644 --- a/scripts/services/docker/Dockerfile.git_integration +++ b/scripts/services/docker/Dockerfile.git_integration @@ -84,11 +84,15 @@ RUN apt-get update && apt-get install -y \ git \ ripgrep \ ruby \ + libgit2-1.1 \ ruby-dev \ build-essential \ + libgit2-dev \ + cmake \ + pkg-config \ --no-install-recommends \ - && gem install licensee -v '10.0.0' --no-document \ - && apt-get remove --autoremove -y ruby-dev build-essential \ + && gem install licensee -v '9.15.3' --no-document \ + && apt-get remove --autoremove -y ruby-dev build-essential libgit2-dev cmake pkg-config \ && rm -rf /var/lib/apt/lists/* ENV PYTHONUNBUFFERED=1 \ diff --git a/services/apps/git_integration/src/crowdgit/database/crud.py b/services/apps/git_integration/src/crowdgit/database/crud.py index e42649d910..8899af11c7 100644 --- a/services/apps/git_integration/src/crowdgit/database/crud.py +++ b/services/apps/git_integration/src/crowdgit/database/crud.py @@ -286,10 +286,10 @@ async def update_last_processed_commit(repo_id: str, commit_hash: str, branch: s async def update_repository_license(repository_id: str, license_spdx: str | None) -> None: sql_query = """ UPDATE public.repositories - SET license = $1, + SET license = $1::varchar, "updatedAt" = NOW() WHERE id = $2 - AND ($1 IS NOT NULL OR license IS NULL) + AND ($1::varchar IS NOT NULL OR license IS NULL) """ await execute(sql_query, (license_spdx, repository_id)) diff --git a/services/apps/git_integration/src/crowdgit/services/license/license_service.py b/services/apps/git_integration/src/crowdgit/services/license/license_service.py index 5ba3f22f4f..59889d6c1c 100644 --- a/services/apps/git_integration/src/crowdgit/services/license/license_service.py +++ b/services/apps/git_integration/src/crowdgit/services/license/license_service.py @@ -27,9 +27,10 @@ async def detect(self, repo_path: str) -> str | None: try: data = json.loads(output) - matched = data.get("matched_license") or {} - spdx_id = matched.get("spdx_id") - confidence = matched.get("confidence") + licenses = data.get("licenses") or [] + matched_files = data.get("matched_files") or [] + spdx_id = licenses[0].get("spdx_id") if licenses else None + confidence = (matched_files[0].get("matcher") or {}).get("confidence") if matched_files else None if spdx_id: self.logger.info(f"License detected: {spdx_id} (confidence={confidence}) in {repo_path}") else: From 58d496872fad3ada0e175a4d7fb4157d401da7c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C5=A1per=20Grom?= Date: Fri, 8 May 2026 10:19:32 +0100 Subject: [PATCH 3/7] fix: ruff format license_service and add license to IRepository DAL IN-1105 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Gašper Grom --- .../src/crowdgit/services/license/license_service.py | 4 +++- services/libs/data-access-layer/src/repositories/index.ts | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/services/apps/git_integration/src/crowdgit/services/license/license_service.py b/services/apps/git_integration/src/crowdgit/services/license/license_service.py index 59889d6c1c..f87bec27ea 100644 --- a/services/apps/git_integration/src/crowdgit/services/license/license_service.py +++ b/services/apps/git_integration/src/crowdgit/services/license/license_service.py @@ -30,7 +30,9 @@ async def detect(self, repo_path: str) -> str | None: licenses = data.get("licenses") or [] matched_files = data.get("matched_files") or [] spdx_id = licenses[0].get("spdx_id") if licenses else None - confidence = (matched_files[0].get("matcher") or {}).get("confidence") if matched_files else None + confidence = ( + (matched_files[0].get("matcher") or {}).get("confidence") if matched_files else None + ) if spdx_id: self.logger.info(f"License detected: {spdx_id} (confidence={confidence}) in {repo_path}") else: diff --git a/services/libs/data-access-layer/src/repositories/index.ts b/services/libs/data-access-layer/src/repositories/index.ts index d0960fe6cc..71615160da 100644 --- a/services/libs/data-access-layer/src/repositories/index.ts +++ b/services/libs/data-access-layer/src/repositories/index.ts @@ -21,6 +21,7 @@ export interface IRepository { updatedAt: string deletedAt: string | null lastArchivedCheckAt: string | null + license: string | null } export interface ICreateRepository { From a1b16380b204f7aa05d72da728dbe1317efa28f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C5=A1per=20Grom?= Date: Fri, 8 May 2026 10:39:38 +0100 Subject: [PATCH 4/7] fix: ruff line length violations in license_service IN-1105 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Gašper Grom --- .../src/crowdgit/services/license/license_service.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/services/license/license_service.py b/services/apps/git_integration/src/crowdgit/services/license/license_service.py index f87bec27ea..b31d517696 100644 --- a/services/apps/git_integration/src/crowdgit/services/license/license_service.py +++ b/services/apps/git_integration/src/crowdgit/services/license/license_service.py @@ -31,10 +31,14 @@ async def detect(self, repo_path: str) -> str | None: matched_files = data.get("matched_files") or [] spdx_id = licenses[0].get("spdx_id") if licenses else None confidence = ( - (matched_files[0].get("matcher") or {}).get("confidence") if matched_files else None + (matched_files[0].get("matcher") or {}).get("confidence") + if matched_files + else None ) if spdx_id: - self.logger.info(f"License detected: {spdx_id} (confidence={confidence}) in {repo_path}") + self.logger.info( + f"License detected: {spdx_id} (confidence={confidence}) in {repo_path}" + ) else: self.logger.info(f"No SPDX license matched in {repo_path}") return spdx_id From f7cff2187e1ffaba95f02e671f5c866ceeef848e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C5=A1per=20Grom?= Date: Fri, 8 May 2026 10:46:34 +0100 Subject: [PATCH 5/7] fix: remove useless try/catch wrappers in member activities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Gašper Grom --- .../src/activities/member.ts | 45 +++++++------------ 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/services/apps/members_enrichment_worker/src/activities/member.ts b/services/apps/members_enrichment_worker/src/activities/member.ts index 36225a1176..e94b34f6e9 100644 --- a/services/apps/members_enrichment_worker/src/activities/member.ts +++ b/services/apps/members_enrichment_worker/src/activities/member.ts @@ -77,15 +77,8 @@ export async function getIdentitiesExistInOtherMembers( excludeMemberId: string, identities: IMemberIdentity[], ): Promise { - let rows: IMemberIdentity[] = [] - - try { - const db = svc.postgres.reader - rows = await getIdentitiesExistInOthers(db, excludeMemberId, identities) - } catch (err) { - throw err - } - + const db = svc.postgres.reader + const rows = await getIdentitiesExistInOthers(db, excludeMemberId, identities) return rows } @@ -94,25 +87,21 @@ export async function updateMemberWithEnrichmentData( identities: IMemberIdentity[], attributes?: IAttributes, ): Promise { - try { - await svc.postgres.writer.connection().tx(async (tx) => { - for (const identity of identities) { - await createMemberIdentity(new PgPromiseQueryExecutor(tx), { - memberId, - platform: identity.platform, - value: identity.value, - type: identity.type, - verified: identity.verified || false, - source: 'enrichment', - }) - } - if (attributes) { - await updateMemberAttributes(tx, memberId, attributes) - } - }) - } catch (err) { - throw err - } + await svc.postgres.writer.connection().tx(async (tx) => { + for (const identity of identities) { + await createMemberIdentity(new PgPromiseQueryExecutor(tx), { + memberId, + platform: identity.platform, + value: identity.value, + type: identity.type, + verified: identity.verified || false, + source: 'enrichment', + }) + } + if (attributes) { + await updateMemberAttributes(tx, memberId, attributes) + } + }) } export async function mergeMembers( From 39a9da7412041ab687c6fd5813932c40869a49cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C5=A1per=20Grom?= Date: Fri, 8 May 2026 11:59:03 +0100 Subject: [PATCH 6/7] fix: address review comments on license detection IN-1105 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Gašper Grom --- services/apps/git_integration/src/crowdgit/database/crud.py | 2 +- .../src/crowdgit/services/license/license_service.py | 2 +- .../src/crowdgit/worker/repository_worker.py | 2 ++ services/libs/data-access-layer/src/repositories/index.ts | 6 ++++-- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/database/crud.py b/services/apps/git_integration/src/crowdgit/database/crud.py index 8899af11c7..a3451cc9c9 100644 --- a/services/apps/git_integration/src/crowdgit/database/crud.py +++ b/services/apps/git_integration/src/crowdgit/database/crud.py @@ -289,7 +289,7 @@ async def update_repository_license(repository_id: str, license_spdx: str | None SET license = $1::varchar, "updatedAt" = NOW() WHERE id = $2 - AND ($1::varchar IS NOT NULL OR license IS NULL) + AND license IS DISTINCT FROM $1::varchar """ await execute(sql_query, (license_spdx, repository_id)) diff --git a/services/apps/git_integration/src/crowdgit/services/license/license_service.py b/services/apps/git_integration/src/crowdgit/services/license/license_service.py index b31d517696..9a28f5ab75 100644 --- a/services/apps/git_integration/src/crowdgit/services/license/license_service.py +++ b/services/apps/git_integration/src/crowdgit/services/license/license_service.py @@ -11,7 +11,7 @@ class LicenseService(BaseService): async def detect(self, repo_path: str) -> str | None: """Run licensee against repo_path and return the SPDX identifier, or None.""" try: - output = await run_shell_command(["licensee", "detect", "--json", repo_path]) + output = await run_shell_command(["licensee", "detect", "--json", repo_path], timeout=60) except CommandExecutionError: self.logger.info(f"licensee found no license in {repo_path}") return None diff --git a/services/apps/git_integration/src/crowdgit/worker/repository_worker.py b/services/apps/git_integration/src/crowdgit/worker/repository_worker.py index fdf598de98..205637bca4 100644 --- a/services/apps/git_integration/src/crowdgit/worker/repository_worker.py +++ b/services/apps/git_integration/src/crowdgit/worker/repository_worker.py @@ -163,6 +163,7 @@ def _bind_repository_context(self, repository: Repository, repo_name: str) -> No (self.maintainer_service, "maintainer_processing"), (self.software_value_service, "software_value_processing"), (self.vulnerability_scanner_service, "vulnerability_scan_processing"), + (self.license_service, "license_detection"), (self.queue_service, "queue_service"), ] @@ -178,6 +179,7 @@ def _reset_all_contexts(self) -> None: self.maintainer_service, self.software_value_service, self.vulnerability_scanner_service, + self.license_service, self.queue_service, ] diff --git a/services/libs/data-access-layer/src/repositories/index.ts b/services/libs/data-access-layer/src/repositories/index.ts index 71615160da..8f1ca19f60 100644 --- a/services/libs/data-access-layer/src/repositories/index.ts +++ b/services/libs/data-access-layer/src/repositories/index.ts @@ -149,7 +149,8 @@ export async function getRepositoriesBySourceIntegrationId( "createdAt", "updatedAt", "deletedAt", - "lastArchivedCheckAt" + "lastArchivedCheckAt", + license FROM public.repositories WHERE "sourceIntegrationId" = $(sourceIntegrationId) AND "deletedAt" IS NULL @@ -191,7 +192,8 @@ export async function getRepositoriesByUrl( "createdAt", "updatedAt", "deletedAt", - "lastArchivedCheckAt" + "lastArchivedCheckAt", + license FROM public.repositories WHERE url IN ($(repoUrls:csv)) ${deletedFilter} From e51b77c5fc3a012b0b067a5b550dc135b0652c5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C5=A1per=20Grom?= Date: Fri, 8 May 2026 12:05:24 +0100 Subject: [PATCH 7/7] fix: wrap long run_shell_command call in license_service IN-1105 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Gašper Grom --- .../src/crowdgit/services/license/license_service.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/services/apps/git_integration/src/crowdgit/services/license/license_service.py b/services/apps/git_integration/src/crowdgit/services/license/license_service.py index 9a28f5ab75..92f9375488 100644 --- a/services/apps/git_integration/src/crowdgit/services/license/license_service.py +++ b/services/apps/git_integration/src/crowdgit/services/license/license_service.py @@ -11,7 +11,9 @@ class LicenseService(BaseService): async def detect(self, repo_path: str) -> str | None: """Run licensee against repo_path and return the SPDX identifier, or None.""" try: - output = await run_shell_command(["licensee", "detect", "--json", repo_path], timeout=60) + output = await run_shell_command( + ["licensee", "detect", "--json", repo_path], timeout=60 + ) except CommandExecutionError: self.logger.info(f"licensee found no license in {repo_path}") return None