const DataFetcher = require('./build/utils/compose-fetcher.js');
const DataProcessor = require('./build/utils/compose-processor.js');
const DataIO = require('./build/utils/compose-io.js');

const COMMITS_PER_PAGE = 150;

async function main() {
    // Internal utility methods.
    const checkForExit = () => {
        if (process.exitCode > 0) {
            console.log(`    Terminating with exit code ${process.exitCode}.`);
            process.exit();
        }
    };

    // Getting the PRs between two commits is a complicated task and must be done
    // in multiple steps. The GitHub API does not have a method for that, so we
    // must improvise. We also need to consider that there is no easy way to fetch
    // information for an arbitrary list of commits; the API can work on ranges,
    // but not on lists.
    //
    // We do not need to run this operation constantly. Release versions don't
    // change. (Though some metadata of PRs can change, so re-indexing should be
    // possible, on demand.)
    //
    // We also have to preconfigure some information, e.g. manually supply the
    // tags or hashes which serve as release boundaries.

    console.log("[*] Building local commit and pull request database.");

    const dataIO = new DataIO();
    dataIO.parseArgs();
    checkForExit();

    await dataIO.loadConfig();
    checkForExit();

    const databaseName = `${dataIO.data_owner}.${dataIO.data_repo}.${dataIO.data_version}.json`;
    console.log(`[*] Configured for the "${dataIO.data_owner}/${dataIO.data_repo}" repository; version ${dataIO.data_version}.`);

    const dataFetcher = new DataFetcher(dataIO.data_owner, dataIO.data_repo);
    const dataProcessor = new DataProcessor();

    if (dataIO.update_data) {
        console.log(`[*] Loading existing data to perform an update.`);
        const oldData = await dataIO.loadData(databaseName);
        dataProcessor.takeData(oldData);
    }

    console.log("[*] Checking the rate limits before.");
    await dataFetcher.checkRates();
    checkForExit();

    // First, we check out the repository at the specified branch/tag/hash. We
    // will use it to retrieve a clean commit log. This step creates a shallow
    // copy of the repository, as we are only interested in the history of the
    // branch. Still, it extracts all of the current files, so it may take a bit
    // of time.

    if (dataIO.skip_checkout) {
        console.log(`[*] Skipping the repository checkout.`);
    } else {
        console.log(`[*] Checking out the repository at "${dataIO.last_commit}".`);
        await dataFetcher.checkoutRepo(dataIO.git_tag, dataIO.last_commit);
        checkForExit();
    }

    if (dataIO.checkout_dir !== "") {
        console.log(`[*] Using the local clone at "${dataIO.checkout_dir}".`);
    }

    if (dataIO.skip_gitlog) {
        console.log(`[*] Skipping the commit log extraction.`);
        dataProcessor.consumeOldLog();
    } else {
        console.log(`[*] Extracting the commit log between "${dataIO.first_commit}" and "${dataIO.last_commit}".`);
        const commitLogSize = await dataFetcher.countCommitHistory(dataIO.first_commit, dataIO.last_commit, dataIO.checkout_dir);
        const commitLog = await dataFetcher.getCommitHistory(dataIO.first_commit, dataIO.last_commit, dataIO.checkout_dir);
        checkForExit();

        // Second, we parse the extracted commit log to generate a list of commit
        // hashes for the next step. We also try to detect whether a commit is a
        // cherry-pick rather than the original commit. We can rely on the commit
        // message body containing a certain string, from which we can take the
        // original commit hash.

        dataProcessor.processLog(commitLog, commitLogSize);
        checkForExit();
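        // For illustration: `git cherry-pick -x` appends a marker of the form
        // "(cherry picked from commit <full hash>)" to the message body, which is
        // presumably the string relied upon here. A minimal extraction could look
        // like this (hypothetical helper; the actual parsing lives in
        // compose-processor.js and may differ):
        //
        //   const getCherryPickSource = (messageBody) => {
        //       const match = messageBody.match(/\(cherry picked from commit ([0-9a-f]{40})\)/);
        //       return match ? match[1] : "";
        //   };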
        // We also need to keep track of the commit history of each release within
        // a version. Releases can, and most often do, include commits outside of
        // the defined range. This happens when a contribution is authored before
        // the defined range, but merged within it.

        console.log(`[*] Extracting commit logs for releases.`);
        for (let i = 0; i < dataIO.releases.length; i++) {
            const release = dataIO.releases[i];

            console.log(`    Extracting the commit log for "${release.name}" (between "${release.from_ref}" and "${release.ref}").`);
            const releaseLog = await dataFetcher.getCommitsBetween(release.from_ref, release.ref, dataIO.checkout_dir);
            checkForExit();

            console.log(`    Processing the commit log for "${release.name}".`);
            dataProcessor._processReleaseLog(release.name, releaseLog);
            checkForExit();
        }
    }

    // This method returns only non-merge commits; we don't need to fetch anything
    // about merge commits. We only need them for a complete commit history.
    const commitHashes = dataProcessor.getCommitHashes();

    if (dataIO.skip_github) {
        console.log(`[*] Skipping the commit data fetching from GitHub.`);
        dataProcessor.consumeOldCommits();
    } else {
        // Third, we generate a query to the GraphQL API to fetch the information
        // about linked PRs. The GraphQL API doesn't have a filter to extract data
        // for a list of commit hashes, but it supports having multiple sub-queries
        // within the same request, which is our way in. (See the sketch after the
        // fetching loop below.)
        //
        // While paginated queries are limited to 100 entries per page, sub-queries
        // do not appear to be similarly limited. We are still limited by the total
        // number of nodes we can theoretically fetch, which is 500,000. As such,
        // we still want to do this in batches, so the number of nodes in each
        // request stays manageable.

        console.log("[*] Fetching commit data from GitHub.");

        let commitsRaw = {};

        const totalPages = Math.ceil(commitHashes.length / COMMITS_PER_PAGE);
        // Pages start at 1 for better presentation.
        let page = 1;
        while (page <= totalPages) {
            const batchHashes = commitHashes.splice(0, COMMITS_PER_PAGE);
            const batchCommits = await dataFetcher.fetchCommits(batchHashes, page, totalPages);
            checkForExit();

            Object.assign(commitsRaw, batchCommits);
            page++;

            // Wait for a bit before proceeding, to avoid hitting the secondary
            // rate limit of the GitHub API. See
            // https://docs.github.com/en/rest/guides/best-practices-for-integrators#dealing-with-secondary-rate-limits.
            await dataFetcher.delay(DataFetcher.API_DELAY_MSEC);

            // Add an extra delay every few requests, because the chance of
            // triggering the hidden rate limit seems to grow with the number of
            // queries.
            if (page % 8 === 0) {
                console.log("[*] Waiting a bit for the API to cool down...");
                await dataFetcher.delay(DataFetcher.API_DELAY_MSEC * 4);
            }
        }
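        // For reference, one batched request of this kind conceptually looks like
        // the following GraphQL document (a sketch under assumed field selections;
        // the actual query construction lives in compose-fetcher.js and may
        // differ):
        //
        //   query {
        //     repository(owner: "<owner>", name: "<repo>") {
        //       commit_0: object(oid: "<hash 0>") {
        //         ... on Commit {
        //           associatedPullRequests(first: 10) { nodes { number title } }
        //         }
        //       }
        //       commit_1: object(oid: "<hash 1>") { ... }
        //       # ...one aliased sub-query per hash in the batch.
        //     }
        //   }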
        // Fourth, we consolidate the information. Commits are populated with
        // links to their respective PRs, and PRs store references to their
        // commits. We will save this to a file for the specified range, which
        // should be between two stable releases.
        //
        // For intermediate releases (developer previews) we have preconfigured
        // hashes and can simply pass them on to the final data. The frontend will
        // handle the rest.

        console.log(`[*] Processing ${Object.keys(commitsRaw).length} commits.`);
        dataProcessor.processCommits(commitsRaw, `${dataIO.data_owner}/${dataIO.data_repo}`);
        checkForExit();
    }

    console.log("[*] Checking the rate limits after.");
    await dataFetcher.checkRates();
    checkForExit();

    console.log("[*] Finalizing database.");

    const output = {
        "generated_at": Date.now(),
        "log": dataProcessor.log,
        "release_logs": dataProcessor.releaseLogs,
        "authors": dataProcessor.authors,
        "commits": dataProcessor.commits,
        "pulls": dataProcessor.pulls,
    };

    await dataIO.saveData(databaseName, output);
    checkForExit();

    console.log("[*] Database built.");
}

main();
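// For reference, the saved database ("<owner>.<repo>.<version>.json") has the
// top-level shape sketched below; the field contents are produced by
// compose-processor.js, and the values here are only illustrative:
//
//   {
//     "generated_at": 1700000000000,  // Millisecond timestamp from Date.now().
//     "log": [ ... ],                 // Commit log for the full version range.
//     "release_logs": { ... },        // Per-release commit logs.
//     "authors": { ... },
//     "commits": { ... },
//     "pulls": { ... }
//   }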