Store individual commit logs for each release within version

commit 111adacfeb
parent 9c8e32833c
Author: Yuri Sizov
Date: 2023-05-30 21:58:47 +02:00

50 changed files with 265 additions and 124 deletions
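At a high level, this change teaches the build script to record, per release, which commits belong to it: the version config gains a releases list, DataProcessor keeps a releaseLogs map, and the generated database gains a release_logs section keyed by release name. A rough sketch of the resulting database layout, with made-up release names and placeholder values (only release_logs is new; the other sections already existed):

{
    "generated_at": 1685000000000,
    "log": ["..."],
    "release_logs": {
        "4.0.4-rc1": ["<commit hash>", "..."],
        "4.0.4-stable": ["<commit hash>", "..."]
    },
    "authors": {},
    "commits": {},
    "pulls": {}
}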


@@ -117,6 +117,10 @@ class DataFetcher {
const commitHistory = stdout.trimEnd();
await this._logResponse(commitHistory, "_commit_shortlog", LogFormat.Raw);
if (commitHistory === "") {
return 0;
}
return commitHistory.split("\n").length;
} catch (err) {
console.error(" Error extracting the commit history: " + err);
@@ -142,6 +146,27 @@ class DataFetcher {
}
}
async getCommitsBetween(fromCommit, toCommit, repoFolder = "") {
try {
if (repoFolder === "") {
repoFolder = `./temp/${this.data_repo}`;
}
const { stdout, stderr } = await exec(`git log --pretty=format:"%H" ${fromCommit}..${toCommit}`, { cwd: repoFolder, maxBuffer: EXEC_MAX_BUFFER });
const commitHashes = stdout;
await this._logResponse(commitHashes, "_commit_hashes", LogFormat.Raw);
if (commitHashes === "") {
return [];
}
return commitHashes.split("\n");
} catch (err) {
console.error(" Error extracting the commit history: " + err);
process.exitCode = ExitCodes.ExecFailure;
return [];
}
}
async fetchGithub(query, retries = 0) {
const init = {};
init.method = "POST";
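The new getCommitsBetween() helper is a thin wrapper around a git range query against the local checkout. A minimal standalone sketch of the same idea, assuming Node's promisified exec and placeholder refs (the real method also logs the raw response and sets an exit code on failure):

const util = require("util");
const exec = util.promisify(require("child_process").exec);

// Returns the commit hashes reachable from toRef but not from fromRef,
// newest first, by running: git log --pretty=format:"%H" fromRef..toRef
async function listHashesBetween(fromRef, toRef, repoFolder) {
    const { stdout } = await exec(`git log --pretty=format:"%H" ${fromRef}..${toRef}`, { cwd: repoFolder });
    return stdout === "" ? [] : stdout.split("\n");
}

// Hypothetical usage against a local shallow clone.
// listHashesBetween("4.0.2-stable", "4.0.3-stable", "./temp/godot").then(console.log);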
@@ -333,15 +358,44 @@ class DataFetcher {
class DataProcessor {
constructor() {
this.log = [];
this.releaseLogs = {};
this.authors = {};
this.commits = {};
this.pulls = {};
this.oldData = {};
}
- _getCommitObject() {
+ takeData(dataObject) {
this.oldData = {
"log": dataObject.log || [],
"releaseLogs": dataObject.release_logs || {},
"commits": dataObject.commits || {},
"authors": dataObject.authors || {},
"pulls": dataObject.pulls || {},
};
}
consumeOldLog() {
this.log = this.oldData.log;
this.releaseLogs = this.oldData.releaseLogs;
this.commits = this.oldData.commits;
}
consumeOldCommits() {
this.authors = this.oldData.authors;
this.pulls = this.oldData.pulls;
}
_getCommitObject(commitHash) {
if (typeof this.oldData.commits[commitHash] !== "undefined") {
return this.oldData.commits[commitHash];
}
return {
"hash": "",
"hash": commitHash,
"is_merge": false,
"authored_by": [],
@@ -389,8 +443,7 @@ class DataProcessor {
return;
}
- const originalCommit = this._getCommitObject();
- originalCommit.hash = commit.cherrypick_hash;
+ const originalCommit = this._getCommitObject(commit.cherrypick_hash);
originalCommit.author_raw = commit.author_raw;
originalCommit.committer_raw = commit.author_raw;
@@ -412,6 +465,9 @@ class DataProcessor {
}
processLog(logRaw, logSize) {
this.log = [];
this.releaseLogs = {};
// Parse the log, given in its "full" format. Records are presented in
// the chronological order, line by line, with each record spanning across
// several lines.
@@ -452,8 +508,10 @@ class DataProcessor {
this._finishCommit(commit);
}
- commit = this._getCommitObject();
- commit.hash = matches[1];
+ commit = this._getCommitObject(matches[1]);
// These fields may come from the old data, we will override them.
commit.summary = "";
commit.body = "";
continue;
}
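For reference, processLog() walks the raw output of git's "full" pretty format, where each record starts with a "commit <hash>" line (the hash captured as matches[1] above), followed by author, committer, and an indented message. Cherry-picks are detected from a marker in the message body; the "(cherry picked from commit ...)" line appended by git cherry-pick -x is the usual form. A hypothetical record, with made-up names and hashes:

commit 1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b
Author: Jane Doe <jane@example.com>
Commit: John Smith <john@example.com>

    Fix a hypothetical crash when loading resources

    (cherry picked from commit 0f1e2d3c4b5a69788796a5b4c3d2e1f00f1e2d3c)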
@@ -524,6 +582,9 @@ class DataProcessor {
}
processCommits(commitsRaw, targetRepo) {
this.authors = {};
this.pulls = {};
try {
for (let commitHash in commitsRaw) {
if (commitsRaw[commitHash] == null) {
@@ -541,6 +602,7 @@ class DataProcessor {
// can be authored by somebody else entirely. We will store them with the PR, and will
// display them as well on the frontend.
commit.authored_by = [];
const commitAuthors = mapNodes(item.authors);
commitAuthors.forEach((authorItem) => {
const authorId = this._processAuthor(authorItem.user);
@@ -618,6 +680,10 @@ class DataProcessor {
}
}
_processReleaseLog(releaseName, commitHashes) {
this.releaseLogs[releaseName] = commitHashes;
}
getCommitHashes() {
const commitHashes = [];
@@ -641,14 +707,21 @@ class DataIO {
this.data_repo = "godot";
this.data_version = "";
- this.skip_checkout = false;
this.checkout_dir = "";
+ //
+ // Execution flags.
+ this.update_data = false;
+ this.skip_checkout = false;
+ this.skip_gitlog = false;
+ this.skip_github = false;
+ // Loaded configuration.
this.config = null;
this.git_tag = "";
this.first_commit = "";
this.last_commit = "";
+ this.releases = [];
}
parseArgs() {
@@ -663,11 +736,21 @@ class DataIO {
this.data_version = arg.substring(8);
}
+ if (arg.indexOf("dir:") === 0) {
+ this.checkout_dir = arg.substring(4);
+ }
+ if (arg === "update-data") {
+ this.update_data = true;
+ }
if (arg === "skip-checkout") {
this.skip_checkout = true;
}
- if (arg.indexOf("dir:") === 0) {
- this.checkout_dir = arg.substring(4);
+ if (arg === "skip-gitlog") {
+ this.skip_gitlog = true;
+ }
+ if (arg === "skip-github") {
+ this.skip_github = true;
+ }
});
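With these additions, parseArgs() recognizes version:, dir:, update-data, skip-checkout, skip-gitlog, and skip-github. A hypothetical invocation (the script filename and version label are placeholders; only the argument names come from the code above):

node build.js version:4.0.3 dir:../godot update-data skip-checkout skip-gitlog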
@@ -687,9 +770,11 @@ class DataIO {
const configContent = await fs.readFile(configPath);
this.config = JSON.parse(configContent);
this.git_tag = this.config.git_tag || this.config.ref;
- this.first_commit = this.config.from_ref;
- this.last_commit = this.config.ref;
+ this.first_commit = this.config.from_ref || "";
+ this.last_commit = this.config.ref || "";
+ this.releases = this.config.releases || [];
} catch (err) {
console.error(" Error loading version config file: " + err);
process.exitCode = ExitCodes.IOFailure;
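loadConfig() now also picks up an optional releases array alongside from_ref and ref. A hypothetical version config showing the fields the script reads (names and refs are made up; each release entry needs name, from_ref, and ref, as used by the release loop in main() below):

{
    "git_tag": "4.0.4-stable",
    "from_ref": "4.0.3-stable",
    "ref": "4.0.4-stable",
    "releases": [
        { "name": "4.0.4-rc1", "from_ref": "4.0.3-stable", "ref": "<rc1 commit hash>" },
        { "name": "4.0.4-stable", "from_ref": "<rc1 commit hash>", "ref": "4.0.4-stable" }
    ]
}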
@@ -697,14 +782,31 @@ class DataIO {
}
}
- async saveData(output, fileName) {
+ async loadData(fileName) {
try {
- console.log("[*] Storing database to a file.");
+ console.log("[*] Loading version database from a file.");
await ensureDir("./data");
- await fs.writeFile(`./data/${fileName}`, JSON.stringify(output), {encoding: "utf-8"});
+ const databasePath = `./data/${fileName}`;
+ await fs.access(databasePath, fsConstants.R_OK);
+ const dataContent = await fs.readFile(databasePath);
+ return JSON.parse(dataContent);
} catch (err) {
- console.error(" Error saving database file: " + err);
+ console.error(" Error loading version database file: " + err);
process.exitCode = ExitCodes.IOFailure;
+ return null;
}
}
+ async saveData(fileName, dataObject) {
+ try {
+ console.log("[*] Storing version database to a file.");
+ await ensureDir("./data");
+ await fs.writeFile(`./data/${fileName}`, JSON.stringify(dataObject), {encoding: "utf-8"});
+ } catch (err) {
+ console.error(" Error saving version database file: " + err);
+ process.exitCode = ExitCodes.IOFailure;
+ return;
+ }
@@ -786,21 +888,31 @@ async function main() {
await dataIO.loadConfig();
checkForExit();
const databaseName = `${dataIO.data_owner}.${dataIO.data_repo}.${dataIO.data_version}.json`;
console.log(`[*] Configured for the "${dataIO.data_owner}/${dataIO.data_repo}" repository; version ${dataIO.data_version}.`);
const dataFetcher = new DataFetcher(dataIO.data_owner, dataIO.data_repo);
const dataProcessor = new DataProcessor();
if (dataIO.update_data) {
console.log(`[*] Loading existing data to perform an update.`);
const oldData = await dataIO.loadData(databaseName);
dataProcessor.takeData(oldData);
}
console.log("[*] Checking the rate limits before.");
await dataFetcher.checkRates();
checkForExit();
// First, we checkout the repository for the specified branch/tag/hash. We will
- // use it to retrieve a clean commit log, ignoring merge commits. This step creates
- // as shallow copy, as we are only interested in the history of the branch.
+ // use it to retrieve a clean commit log. This step creates a shallow copy of the
+ // repository, as we are only interested in the history of the branch.
// Still, it extracts all of the current files, so it may take a bit of time.
- if (!dataIO.skip_checkout) {
+ if (dataIO.skip_checkout) {
+ console.log(`[*] Skipping the repository checkout.`);
+ } else {
console.log(`[*] Checking out the repository at "${dataIO.last_commit}".`);
await dataFetcher.checkoutRepo(dataIO.git_tag, dataIO.last_commit);
checkForExit();
@@ -810,69 +922,97 @@ async function main() {
console.log(`[*] Using the local clone at "${dataIO.checkout_dir}".`);
}
- console.log(`[*] Extracting the commit log between "${dataIO.first_commit}" and "${dataIO.last_commit}".`);
- const commitLogSize = await dataFetcher.countCommitHistory(dataIO.first_commit, dataIO.last_commit, dataIO.checkout_dir);
- const commitLog = await dataFetcher.getCommitHistory(dataIO.first_commit, dataIO.last_commit, dataIO.checkout_dir);
- checkForExit();
- // Second, we parse the extracted commit log, to generate a list of commit hashes
- // for the next step. We also try to extract the information about this being a
- // cherry-pick, and not the original commit. We can rely on the commit message body
- // containing a certain string, from which we can take the original commit hash.
- dataProcessor.processLog(commitLog, commitLogSize);
- checkForExit();
- // This method returns only non-merge commits; we don't need to fetch anything about
- // merge commits. We only need them for commit history.
- const commitHashes = dataProcessor.getCommitHashes();
- // Third, we generate a query to the GraphQL API to fetch the information about
- // linked PRs. GraphQL API doesn't have a filter to extract data for a list of
- // commit hashes, but it supports having multiple sub-queries within the same request,
- // which is our way in.
- //
- // While paginated queries are limited to 100 entries per page, sub-queries do not
- // appear to be similarly limited. We are still limited by the total number of nodes
- // we can theoretically fetch, which is 500 000. As such, we still want to do this
- // in batches, so the number of nodes in each request is manageable.
- console.log("[*] Fetching commit data from GitHub.");
- let commitsRaw = {};
- const totalPages = Math.ceil(commitHashes.length / COMMITS_PER_PAGE);
- // Pages are starting with 1 for better presentation.
- let page = 1;
- while (page <= totalPages) {
- const batchHashes = commitHashes.splice(0, COMMITS_PER_PAGE);
- const batchCommits = await dataFetcher.fetchCommits(batchHashes, page, totalPages);
+ if (dataIO.skip_gitlog) {
+ console.log(`[*] Skipping the commit log extraction.`);
+ dataProcessor.consumeOldLog();
+ } else {
+ console.log(`[*] Extracting the commit log between "${dataIO.first_commit}" and "${dataIO.last_commit}".`);
+ const commitLogSize = await dataFetcher.countCommitHistory(dataIO.first_commit, dataIO.last_commit, dataIO.checkout_dir);
+ const commitLog = await dataFetcher.getCommitHistory(dataIO.first_commit, dataIO.last_commit, dataIO.checkout_dir);
checkForExit();
- Object.assign(commitsRaw, batchCommits);
- page++;
+ // Second, we parse the extracted commit log, to generate a list of commit hashes
+ // for the next step. We also try to extract the information about this being a
+ // cherry-pick, and not the original commit. We can rely on the commit message body
+ // containing a certain string, from which we can take the original commit hash.
- // Wait for a bit before proceeding to avoid hitting the secondary rate limit in GitHub API.
- // See https://docs.github.com/en/rest/guides/best-practices-for-integrators#dealing-with-secondary-rate-limits.
- await dataFetcher.delay(API_DELAY_MSEC);
+ dataProcessor.processLog(commitLog, commitLogSize);
checkForExit();
- // Add an extra delay every few requests, because the chance to trigger the hidden rate issue
- // seems to grow with the number of queries.
- if (page % 8 === 0) {
- console.log("[*] Waiting a bit for the API to cool down...");
- await dataFetcher.delay(API_DELAY_MSEC * 4);
+ // We also need to keep track of the commit history of each release within a version.
+ // Releases can, and most often do, include commits outside of the defined range. This
+ // happens when a contribution is authored before the defined range, but merged within
+ // it.
+ console.log(`[*] Extracting commit logs for releases.`);
+ for (let i = 0; i < dataIO.releases.length; i++) {
+ const release = dataIO.releases[i];
+ console.log(`    Extracting the commit log for "${release.name}" (between "${release.from_ref}" and "${release.ref}").`);
+ const releaseLog = await dataFetcher.getCommitsBetween(release.from_ref, release.ref, dataIO.checkout_dir);
+ checkForExit();
+ console.log(`    Processing the commit log for "${release.name}".`);
+ dataProcessor._processReleaseLog(release.name, releaseLog);
+ checkForExit();
}
}
- // Fourth, we consolidate the information. Commits are populated with links to their
- // respective PRs, and PRs store references to their commits. We will save this to
- // a file for the specified range, which should be between two stable releases.
- //
- // For intermediate releases (developer previews) we have preconfigured hashes and
- // can simply pass them to the final data. Frontend will handle the rest.
+ // This method returns only non-merge commits; we don't need to fetch anything about
+ // merge commits. We only need them for a complete commit history.
+ const commitHashes = dataProcessor.getCommitHashes();
- console.log(`[*] Processing ${Object.keys(commitsRaw).length} commits.`);
- dataProcessor.processCommits(commitsRaw, `${dataIO.data_owner}/${dataIO.data_repo}`);
- checkForExit();
if (dataIO.skip_github) {
console.log(`[*] Skipping the commit data fetching from GitHub.`);
dataProcessor.consumeOldCommits();
} else {
// Third, we generate a query to the GraphQL API to fetch the information about
// linked PRs. GraphQL API doesn't have a filter to extract data for a list of
// commit hashes, but it supports having multiple sub-queries within the same request,
// which is our way in.
//
// While paginated queries are limited to 100 entries per page, sub-queries do not
// appear to be similarly limited. We are still limited by the total number of nodes
// we can theoretically fetch, which is 500 000. As such, we still want to do this
// in batches, so the number of nodes in each request is manageable.
console.log("[*] Fetching commit data from GitHub.");
let commitsRaw = {};
const totalPages = Math.ceil(commitHashes.length / COMMITS_PER_PAGE);
// Pages are starting with 1 for better presentation.
let page = 1;
while (page <= totalPages) {
const batchHashes = commitHashes.splice(0, COMMITS_PER_PAGE);
const batchCommits = await dataFetcher.fetchCommits(batchHashes, page, totalPages);
checkForExit();
Object.assign(commitsRaw, batchCommits);
page++;
// Wait for a bit before proceeding to avoid hitting the secondary rate limit in GitHub API.
// See https://docs.github.com/en/rest/guides/best-practices-for-integrators#dealing-with-secondary-rate-limits.
await dataFetcher.delay(API_DELAY_MSEC);
// Add an extra delay every few requests, because the chance to trigger the hidden rate issue
// seems to grow with the number of queries.
if (page % 8 === 0) {
console.log("[*] Waiting a bit for the API to cool down...");
await dataFetcher.delay(API_DELAY_MSEC * 4);
}
}
// Fourth, we consolidate the information. Commits are populated with links to their
// respective PRs, and PRs store references to their commits. We will save this to
// a file for the specified range, which should be between two stable releases.
//
// For intermediate releases (developer previews) we have preconfigured hashes and
// can simply pass them to the final data. Frontend will handle the rest.
console.log(`[*] Processing ${Object.keys(commitsRaw).length} commits.`);
dataProcessor.processCommits(commitsRaw, `${dataIO.data_owner}/${dataIO.data_repo}`);
checkForExit();
}
console.log("[*] Checking the rate limits after.")
await dataFetcher.checkRates();
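The "Third" comment block above describes the batching trick used against the GraphQL API: since there is no filter for a list of commit hashes, each hash becomes its own aliased sub-query within a single request. The actual query is assembled in DataFetcher.fetchCommits(), which is not part of this diff; the sketch below only illustrates the pattern, with a placeholder owner/name and a minimal field selection:

// Illustrative only: builds one GraphQL request with an aliased sub-query per hash.
function buildCommitQuery(commitHashes) {
    const subQueries = commitHashes.map((hash, index) => `
        commit_${index}: repository(owner: "godotengine", name: "godot") {
            object(oid: "${hash}") {
                ... on Commit {
                    oid
                    messageHeadline
                    associatedPullRequests(first: 1) { nodes { number title } }
                }
            }
        }`);
    return `query {${subQueries.join("\n")}}`;
}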
@@ -882,12 +1022,13 @@ async function main() {
const output = {
"generated_at": Date.now(),
"log": dataProcessor.log,
"release_logs": dataProcessor.releaseLogs,
"authors": dataProcessor.authors,
"commits": dataProcessor.commits,
"pulls": dataProcessor.pulls,
};
- await dataIO.saveData(output, `${dataIO.data_owner}.${dataIO.data_repo}.${dataIO.data_version}.json`);
+ await dataIO.saveData(databaseName, output);
checkForExit();
console.log("[*] Database built.");