async def update_repository_license(repository_id: str, license_spdx: str | None) -> None:
    """Store the detected license on a single ``public.repositories`` row.

    The statement only touches the row when the stored value differs from
    ``license_spdx`` (``IS DISTINCT FROM`` also compares NULLs), so
    ``"updatedAt"`` is bumped only on an actual change.

    Args:
        repository_id: Value matched against the ``id`` column.
        license_spdx: License identifier to store. ``None`` is presumably
            written as SQL NULL by the ``execute`` helper -- confirm against
            the database driver in use.
    """
    # Positional order must line up with the placeholders: $1 = license, $2 = id.
    params = (license_spdx, repository_id)
    statement = """
    UPDATE public.repositories
    SET license = $1::varchar,
        "updatedAt" = NOW()
    WHERE id = $2
      AND license IS DISTINCT FROM $1::varchar
    """
    await execute(statement, params)
class LicenseService(BaseService):
    """SPDX license detection for a cloned repository, backed by the licensee CLI."""

    async def detect(self, repo_path: str) -> str | None:
        """Return the SPDX identifier licensee reports for ``repo_path``.

        Detection is best-effort: this method never raises. A failure to run
        the command or to parse its JSON output is logged and reported as
        ``None``.

        Args:
            repo_path: Path handed to ``licensee detect`` as its target.

        Returns:
            The ``spdx_id`` of the first entry in the report's ``licenses``
            list, or ``None`` when the list is empty or anything goes wrong.

        NOTE(review): a timeout, a missing binary and a parse failure are all
        indistinguishable from "no license found" for the caller -- confirm
        that callers persisting this value are fine with that.
        """
        command = ["licensee", "detect", "--json", repo_path]
        try:
            raw_report = await run_shell_command(command, timeout=60)
        except CommandExecutionError:
            # A failed command is treated as "nothing detected", not as an error.
            self.logger.info(f"licensee found no license in {repo_path}")
            return None
        except CommandTimeoutError as e:
            self.logger.warning(f"licensee timed out: {repr(e)}")
            return None
        except FileNotFoundError as e:
            self.logger.warning(f"licensee binary not found in PATH: {repr(e)}")
            return None
        except Exception as e:
            self.logger.warning(f"licensee failed: {repr(e)}")
            return None

        try:
            report = json.loads(raw_report)
            detected = report.get("licenses") or []
            files = report.get("matched_files") or []

            spdx_id = None
            if detected:
                spdx_id = detected[0].get("spdx_id")

            # Confidence is only used to enrich the log line below.
            confidence = None
            if files:
                matcher = files[0].get("matcher") or {}
                confidence = matcher.get("confidence")

            if not spdx_id:
                self.logger.info(f"No SPDX license matched in {repo_path}")
                return spdx_id

            self.logger.info(
                f"License detected: {spdx_id} (confidence={confidence}) in {repo_path}"
            )
            return spdx_id
        except Exception as e:
            # Covers malformed JSON as well as an unexpected report shape.
            self.logger.warning(f"Failed to parse licensee output: {repr(e)}")
            return None
vulnerability_scanner_service self.maintainer_service = maintainer_service + self.license_service = license_service self.queue_service = queue_service self._shutdown = False @@ -159,6 +163,7 @@ def _bind_repository_context(self, repository: Repository, repo_name: str) -> No (self.maintainer_service, "maintainer_processing"), (self.software_value_service, "software_value_processing"), (self.vulnerability_scanner_service, "vulnerability_scan_processing"), + (self.license_service, "license_detection"), (self.queue_service, "queue_service"), ] @@ -174,6 +179,7 @@ def _reset_all_contexts(self) -> None: self.maintainer_service, self.software_value_service, self.vulnerability_scanner_service, + self.license_service, self.queue_service, ] @@ -236,6 +242,8 @@ async def _process_single_repository(self, repository: Repository): repository.id, batch_info.repo_path, repository.url ) await self.maintainer_service.process_maintainers(repository, batch_info) + license_spdx = await self.license_service.detect(batch_info.repo_path) + await update_repository_license(repository.id, license_spdx) await self.commit_service.process_single_batch_commits( repository, batch_info, diff --git a/services/apps/members_enrichment_worker/src/activities/member.ts b/services/apps/members_enrichment_worker/src/activities/member.ts index 36225a1176..e94b34f6e9 100644 --- a/services/apps/members_enrichment_worker/src/activities/member.ts +++ b/services/apps/members_enrichment_worker/src/activities/member.ts @@ -77,15 +77,8 @@ export async function getIdentitiesExistInOtherMembers( excludeMemberId: string, identities: IMemberIdentity[], ): Promise { - let rows: IMemberIdentity[] = [] - - try { - const db = svc.postgres.reader - rows = await getIdentitiesExistInOthers(db, excludeMemberId, identities) - } catch (err) { - throw err - } - + const db = svc.postgres.reader + const rows = await getIdentitiesExistInOthers(db, excludeMemberId, identities) return rows } @@ -94,25 +87,21 @@ export async function 
updateMemberWithEnrichmentData( identities: IMemberIdentity[], attributes?: IAttributes, ): Promise { - try { - await svc.postgres.writer.connection().tx(async (tx) => { - for (const identity of identities) { - await createMemberIdentity(new PgPromiseQueryExecutor(tx), { - memberId, - platform: identity.platform, - value: identity.value, - type: identity.type, - verified: identity.verified || false, - source: 'enrichment', - }) - } - if (attributes) { - await updateMemberAttributes(tx, memberId, attributes) - } - }) - } catch (err) { - throw err - } + await svc.postgres.writer.connection().tx(async (tx) => { + for (const identity of identities) { + await createMemberIdentity(new PgPromiseQueryExecutor(tx), { + memberId, + platform: identity.platform, + value: identity.value, + type: identity.type, + verified: identity.verified || false, + source: 'enrichment', + }) + } + if (attributes) { + await updateMemberAttributes(tx, memberId, attributes) + } + }) } export async function mergeMembers( diff --git a/services/libs/data-access-layer/src/repositories/index.ts b/services/libs/data-access-layer/src/repositories/index.ts index d0960fe6cc..8f1ca19f60 100644 --- a/services/libs/data-access-layer/src/repositories/index.ts +++ b/services/libs/data-access-layer/src/repositories/index.ts @@ -21,6 +21,7 @@ export interface IRepository { updatedAt: string deletedAt: string | null lastArchivedCheckAt: string | null + license: string | null } export interface ICreateRepository { @@ -148,7 +149,8 @@ export async function getRepositoriesBySourceIntegrationId( "createdAt", "updatedAt", "deletedAt", - "lastArchivedCheckAt" + "lastArchivedCheckAt", + license FROM public.repositories WHERE "sourceIntegrationId" = $(sourceIntegrationId) AND "deletedAt" IS NULL @@ -190,7 +192,8 @@ export async function getRepositoriesByUrl( "createdAt", "updatedAt", "deletedAt", - "lastArchivedCheckAt" + "lastArchivedCheckAt", + license FROM public.repositories WHERE url IN ($(repoUrls:csv)) 
${deletedFilter}