godot-interactive-changelog/compose-db.js

const DataFetcher = require('./build/utils/compose-fetcher.js');
const DataProcessor = require('./build/utils/compose-processor.js');
const DataIO = require('./build/utils/compose-io.js');
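
// Requests for 150 commits at a time started returning HTTP 502 errors,
// so keep batches small.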
const COMMITS_PER_PAGE = 50;
async function main() {
    // Internal utility methods.
    const checkForExit = () => {
        if (process.exitCode > 0) {
            console.log(` Terminating with exit code ${process.exitCode}.`);
            process.exit();
        }
    };
    // Getting the PRs between two commits is a complicated task and must be done in
    // multiple steps. The GitHub API does not have a method for that, so we must
    // improvise. We also need to consider that there is no easy way to fetch
    // information for an arbitrary list of commits; the API can work on ranges,
    // but not on lists.
    //
    // We do not need to run this operation constantly. Release versions don't change.
    // (Though some metadata of PRs can change, so re-indexing should be possible on
    // demand.)
    //
    // We also have to preconfigure some information, e.g. manually supply the tags
    // or hashes which serve as release boundaries.
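    //
    // In outline, the steps below are:
    //   1. Check out the repository and extract a clean commit log with git.
    //   2. Parse the log into commit hashes, detecting cherry-picked commits.
    //   3. Fetch commit and PR data from the GitHub GraphQL API in batches.
    //   4. Consolidate everything and save it as a JSON database.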
console.log("[*] Building local commit and pull request database.");
const dataIO = new DataIO();
dataIO.parseArgs();
checkForExit();
await dataIO.loadConfig();
checkForExit();
const databaseName = `${dataIO.data_owner}.${dataIO.data_repo}.${dataIO.data_version}.json`;
console.log(`[*] Configured for the "${dataIO.data_owner}/${dataIO.data_repo}" repository; version ${dataIO.data_version}.`);
const dataFetcher = new DataFetcher(dataIO.data_owner, dataIO.data_repo);
const dataProcessor = new DataProcessor();
if (dataIO.update_data) {
console.log(`[*] Loading existing data to perform an update.`);
const oldData = await dataIO.loadData(databaseName);
dataProcessor.takeData(oldData);
}
console.log("[*] Checking the rate limits before.");
await dataFetcher.checkRates();
checkForExit();
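    // (GitHub exposes the remaining request quota through its rate limit API, so
    // checking it before and after a run shows how much budget the run consumed.)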
    // First, we check out the repository for the specified branch/tag/hash. We will
    // use it to retrieve a clean commit log. This step creates a shallow copy of the
    // repository, as we are only interested in the history of the branch.
    // Still, it extracts all of the current files, so it may take a bit of time.
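    // (As an illustration only: this amounts to something like
    // `git clone --single-branch --branch <tag>`, possibly combined with a shallow
    // `--depth`; the exact commands live in compose-fetcher.js.)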
    if (dataIO.skip_checkout) {
        console.log(`[*] Skipping the repository checkout.`);
    } else {
        console.log(`[*] Checking out the repository at "${dataIO.last_commit}".`);
        await dataFetcher.checkoutRepo(dataIO.git_tag, dataIO.last_commit);
        checkForExit();
    }

    if (dataIO.checkout_dir !== "") {
        console.log(`[*] Using the local clone at "${dataIO.checkout_dir}".`);
    }
    if (dataIO.skip_gitlog) {
        console.log(`[*] Skipping the commit log extraction.`);
        dataProcessor.consumeOldLog();
    } else {
        console.log(`[*] Extracting the commit log between "${dataIO.first_commit}" and "${dataIO.last_commit}".`);
        const commitLogSize = await dataFetcher.countCommitHistory(dataIO.first_commit, dataIO.last_commit, dataIO.checkout_dir);
        const commitLog = await dataFetcher.getCommitHistory(dataIO.first_commit, dataIO.last_commit, dataIO.checkout_dir);
        checkForExit();
        // Second, we parse the extracted commit log to generate a list of commit hashes
        // for the next step. We also try to detect whether a commit is a cherry-pick
        // rather than an original commit. We can rely on the commit message body
        // containing a certain string, from which we can take the original commit hash.
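        // For example, `git cherry-pick -x` records the original hash as a message
        // trailer of the form "(cherry picked from commit <hash>)".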
        dataProcessor.processLog(commitLog, commitLogSize);
        checkForExit();

        // We also need to keep track of the commit history of each release within a
        // version. Releases can, and most often do, include commits outside of the
        // defined range. This happens when a contribution is authored before the
        // defined range but merged within it.
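        // (Conceptually this is a plain `git log <from_ref>..<ref>` over the local
        // clone, one range per release.)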
        console.log(`[*] Extracting commit logs for releases.`);
        for (let i = 0; i < dataIO.releases.length; i++) {
            const release = dataIO.releases[i];
            console.log(` Extracting the commit log for "${release.name}" (between "${release.from_ref}" and "${release.ref}").`);
            const releaseLog = await dataFetcher.getCommitsBetween(release.from_ref, release.ref, dataIO.checkout_dir);
            checkForExit();

            console.log(` Processing the commit log for "${release.name}".`);
            dataProcessor._processReleaseLog(release.name, releaseLog);
            checkForExit();
        }
    }
    // This method returns only non-merge commits; we don't need to fetch anything
    // about merge commits. We only need them for a complete commit history.
    const commitHashes = dataProcessor.getCommitHashes();
    if (dataIO.skip_github) {
        console.log(`[*] Skipping the commit data fetching from GitHub.`);
        dataProcessor.consumeOldCommits();
    } else {
        // Third, we generate a query to the GraphQL API to fetch the information about
        // linked PRs. The GraphQL API doesn't have a filter to extract data for a list
        // of commit hashes, but it supports having multiple sub-queries within the same
        // request, which is our way in.
        //
        // While paginated queries are limited to 100 entries per page, sub-queries do
        // not appear to be similarly limited. We are still limited by the total number
        // of nodes we can theoretically fetch, which is 500,000. As such, we still want
        // to do this in batches, so the number of nodes in each request stays manageable.
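        // As a sketch (the real query is assembled in compose-fetcher.js), a batched
        // request aliases one sub-query per commit hash:
        //
        //   query {
        //     repository(owner: "<owner>", name: "<repo>") {
        //       commit_0: object(oid: "<hash 1>") {
        //         ... on Commit { associatedPullRequests(first: 10) { nodes { number title } } }
        //       }
        //       commit_1: object(oid: "<hash 2>") {
        //         ... on Commit { associatedPullRequests(first: 10) { nodes { number title } } }
        //       }
        //     }
        //   }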
console.log("[*] Fetching commit data from GitHub.");
let commitsRaw = {};
const totalPages = Math.ceil(commitHashes.length / COMMITS_PER_PAGE);
// Pages are starting with 1 for better presentation.
let page = 1;
while (page <= totalPages) {
const batchHashes = commitHashes.splice(0, COMMITS_PER_PAGE);
const batchCommits = await dataFetcher.fetchCommits(batchHashes, page, totalPages);
checkForExit();
Object.assign(commitsRaw, batchCommits);
page++;
// Wait for a bit before proceeding to avoid hitting the secondary rate limit in GitHub API.
// See https://docs.github.com/en/rest/guides/best-practices-for-integrators#dealing-with-secondary-rate-limits.
await dataFetcher.delay(DataFetcher.API_DELAY_MSEC);
// Add an extra delay every few requests, because the chance to trigger the hidden rate issue
// seems to grow with the number of queries.
if (page % 8 === 0) {
console.log("[*] Waiting a bit for the API to cool down...");
await dataFetcher.delay(DataFetcher.API_DELAY_MSEC * 4);
}
}
        // Fourth, we consolidate the information. Commits are populated with links to
        // their respective PRs, and PRs store references to their commits. We will save
        // this to a file for the specified range, which should be between two stable
        // releases.
        //
        // For intermediate releases (developer previews) we have preconfigured hashes
        // and can simply pass them to the final data. The frontend will handle the rest.
        console.log(`[*] Processing ${Object.keys(commitsRaw).length} commits.`);
        dataProcessor.processCommits(commitsRaw, `${dataIO.data_owner}/${dataIO.data_repo}`);
        checkForExit();
    }
console.log("[*] Checking the rate limits after.")
await dataFetcher.checkRates();
checkForExit();
console.log("[*] Finalizing database.")
const output = {
"generated_at": Date.now(),
"log": dataProcessor.log,
"release_logs": dataProcessor.releaseLogs,
"authors": dataProcessor.authors,
"commits": dataProcessor.commits,
"pulls": dataProcessor.pulls,
};
await dataIO.saveData(databaseName, output);
checkForExit();
console.log("[*] Database built.");
}
main();
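
// A sketch of a typical invocation (the accepted flags are defined by
// DataIO.parseArgs() in compose-io.js and are not reproduced here):
//
//   node compose-db.js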