Implement fetching and parsing of the commit log
- Validate the results with a size check.
compose-db.js
@@ -1,10 +1,18 @@
 const fs = require('fs').promises;
 const fsConstants = require('fs').constants;
+const nodeUtil = require('util');
 const fetch = require('node-fetch');
+const exec = nodeUtil.promisify(require('child_process').exec);
 
 const ExitCodes = {
     "RequestFailure": 1,
     "ParseFailure": 2,
+    "ExecFailure": 3,
 };
 
+const LogFormat = {
+    "Raw": 0,
+    "JSON": 1,
+};
+
 const ITEMS_PER_PAGE = 100;
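Note on the exit codes: throughout the script, failures set process.exitCode instead of calling process.exit() immediately, which lets pending log writes flush; main's checkForExit helper (further down in this diff) then turns a non-zero code into an actual exit at a safe point. A minimal sketch of the pattern, reusing the names above:

    // On failure, remember the exit code but keep running to a safe point.
    process.exitCode = ExitCodes.ParseFailure;

    // At a checkpoint (shape borrowed from main below), bail out for real.
    const checkForExit = () => {
        if (process.exitCode > 0) {
            process.exit();
        }
    };
    checkForExit();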
@@ -17,8 +25,17 @@ const API_RATE_LIMIT = `
   }
 `;
 
+const GIT_HEAD_COMMIT_RE = RegExp("^commit ([a-zA-Z0-9-_]+)$");
+const GIT_HEAD_AUTHOR_RE = RegExp("^Author: (.+)$");
+const GIT_HEAD_COMMITTER_RE = RegExp("^Commit: (.+)$");
+const GIT_BODY_LINE_RE = RegExp("^[\\s]{2,}(.*)$");
+const GIT_BODY_CHERRYPICK_RE = RegExp("^[\\s]{2,}\\(cherry picked from commit ([a-zA-Z0-9-_]+)\\)$");
+
 class DataFetcher {
     constructor(data_owner, data_repo) {
         this.data_owner = data_owner;
         this.data_repo = data_repo;
 
+        this.repo_ssh_path = `git@github.com:${data_owner}/${data_repo}.git`;
         this.api_rest_path = `https://api.github.com/repos/${data_owner}/${data_repo}`;
         this.api_repository_id = `owner:"${data_owner}" name:"${data_repo}"`;
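These patterns target the header and indented body lines of `git log --pretty=full` output. A standalone sketch of how they resolve against one record (the hash, names, and message are fabricated for illustration):

    const GIT_HEAD_COMMIT_RE = RegExp("^commit ([a-zA-Z0-9-_]+)$");
    const GIT_BODY_CHERRYPICK_RE = RegExp("^[\\s]{2,}\\(cherry picked from commit ([a-zA-Z0-9-_]+)\\)$");

    const record = [
        "commit aaaa1111",
        "Author: Jane Doe <jane@example.com>",
        "Commit: John Doe <john@example.com>",
        "",
        "    Fix a crash in the importer",
        "",
        "    (cherry picked from commit cccc3333)",
    ];

    console.log(record[0].match(GIT_HEAD_COMMIT_RE)[1]);      // "aaaa1111"
    console.log(record[6].match(GIT_BODY_CHERRYPICK_RE)[1]);  // "cccc3333"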
@@ -27,15 +44,19 @@ class DataFetcher {
         this.last_cursor = "";
     }
 
-    async _logResponse(data, name) {
+    async _logResponse(data, name, format = LogFormat.JSON) {
         try {
-            try {
-                await fs.access("logs", fsConstants.R_OK | fsConstants.W_OK);
-            } catch (err) {
-                await fs.mkdir("logs");
-            }
+            await ensureDir("./logs");
+
+            let filename = `./logs/${name}`;
+            let fileContent = "" + data;
+
+            if (format === LogFormat.JSON) {
+                filename = `./logs/${name}.json`;
+                fileContent = JSON.stringify(data, null, 4);
+            }
 
-            await fs.writeFile(`logs/${name}.json`, JSON.stringify(data, null, 4), {encoding: "utf-8"});
+            await fs.writeFile(filename, fileContent, {encoding: "utf-8"});
         } catch (err) {
             console.error("Error saving log file: " + err);
         }
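The added format parameter defaults to LogFormat.JSON, so existing call sites keep producing pretty-printed .json files, while the raw mode used for git output below opts in explicitly. A usage sketch from inside the class (the data variables and the first log name are illustrative):

    // Written to ./logs/response.json, pretty-printed as JSON.
    await this._logResponse(someObject, "response");

    // Written to ./logs/_commit_shortlog verbatim, with no .json suffix.
    await this._logResponse(someText, "_commit_shortlog", LogFormat.Raw);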
@@ -61,11 +82,46 @@ class DataFetcher {
     }
 
+    async checkoutRepo(atCommit) {
+        try {
+            // Make sure that the temp folder exists and is empty.
+            await ensureDir("./temp");
+            await clearDir("./temp");
+
+            // Check out a shallow clone of the repository; we are only interested in its history.
+            await exec(`git clone --filter=tree:0 --branch ${atCommit} --single-branch ${this.repo_ssh_path}`, { cwd: "./temp" });
+        } catch (err) {
+            console.error("    Error checking out a copy of the target repository: " + err);
+            process.exitCode = ExitCodes.ExecFailure;
+            return;
+        }
+    }
+
-    getCommitHistory(fromCommit, toCommit) {
+    async countCommitHistory(fromCommit, toCommit) {
+        try {
+            const { stdout, stderr } = await exec(`git log --pretty=oneline --no-merges ${fromCommit}..${toCommit}`, { cwd: `./temp/${this.data_repo}` });
+
+            const commitHistory = stdout.trimEnd();
+            await this._logResponse(commitHistory, "_commit_shortlog", LogFormat.Raw);
+            return commitHistory.split("\n").length;
+        } catch (err) {
+            console.error("    Error extracting the commit history: " + err);
+            process.exitCode = ExitCodes.ExecFailure;
+            return 0;
+        }
+    }
+
+    async getCommitHistory(fromCommit, toCommit) {
+        try {
+            const { stdout, stderr } = await exec(`git log --pretty=full --no-merges ${fromCommit}..${toCommit}`, { cwd: `./temp/${this.data_repo}` });
+
+            const commitHistory = stdout;
+            await this._logResponse(commitHistory, "_commit_history", LogFormat.Raw);
+            return commitHistory;
+        } catch (err) {
+            console.error("    Error extracting the commit history: " + err);
+            process.exitCode = ExitCodes.ExecFailure;
+            return "";
+        }
+    }
+
     async fetchGithub(query) {
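Two notes on the git invocations above. `--filter=tree:0` requests a treeless partial clone: the full commit history is fetched, but file trees are downloaded lazily, which is close to what the comment calls a shallow clone. And `${fromCommit}..${toCommit}` restricts the log to commits reachable from the tip but not from the base. A standalone sketch of the counting step under those assumptions (repoPath is a placeholder):

    const nodeUtil = require('util');
    const exec = nodeUtil.promisify(require('child_process').exec);

    async function countCommits(fromRef, toRef, repoPath) {
        // One line per commit; trimEnd() drops the trailing newline so the
        // split below does not add a spurious empty entry.
        const { stdout } = await exec(`git log --pretty=oneline --no-merges ${fromRef}..${toRef}`, { cwd: repoPath });
        return stdout.trimEnd().split("\n").length;
    }

One edge case worth knowing: for an empty range the trimmed output is "", and "".split("\n") is [""], so the count comes out as 1 rather than 0; the size check in processLog further down would surface such a mismatch.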
@@ -138,10 +194,14 @@ class DataFetcher {
 
                     messageHeadline
+                    messageBody
 
                     author {
+                        date
+                        email
+                        name
                         user {
                             login
                             avatarUrl
                             url
                             id
                         }
                     }
 
                     associatedPullRequests (first: 100) {
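The fields selected above come back in GitHub's connection shape, where lists are wrapped in edges/node pairs; the mapNodes helper defined later in this file unwraps them. A self-contained sketch with a fabricated response fragment:

    function mapNodes(object) {
        return object.edges.map((item) => item["node"]);
    }

    // Hypothetical fragment of a commit node, shaped like a GraphQL connection.
    const commitNode = {
        associatedPullRequests: {
            edges: [
                { node: { number: 1234, title: "Hypothetical linked PR" } },
            ],
        },
    };

    console.log(mapNodes(commitNode.associatedPullRequests)[0].number); // 1234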
@@ -185,19 +245,19 @@ class DataFetcher {
         `
     }
 
-    async fetchCommits(commits) {
+    async fetchCommits(commitHashes) {
         try {
             const query = `
             query {
                 ${API_RATE_LIMIT}
 
-                ${commits.map((item) => {
+                ${commitHashes.map((item) => {
                     return this._getCommitQuery(item) + "\n";
                 })}
             }
             `;
 
-            console.log(`    Requesting a batch of ${commits.length} commits.`);
+            console.log(`    Requesting a batch of ${commitHashes.length} commits.`);
 
             const res = await this.fetchGithub(query);
             if (res.status !== 200) {
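GraphQL allows many aliased sub-queries in a single request, which is what fetchCommits leans on to cover a whole batch with one HTTP call. The hunk does not show _getCommitQuery itself, so the aliases and field selection below are assumptions, not the script's actual query; this is only an illustration of the batching idea:

    const commitHashes = ["aaaa1111", "bbbb2222"]; // placeholder hashes

    const query = `
    query {
        ${commitHashes.map((hash, index) => {
            // One aliased sub-query per hash; a single request covers the batch.
            return `commit_${index}: repository(owner: "godotengine", name: "godot") {
                object(oid: "${hash}") { ... on Commit { messageHeadline } }
            }`;
        }).join("\n")}
    }
    `;

As a side note, the script interpolates the mapped array directly without join; Array-to-string coercion inserts commas between the entries, which GraphQL happens to treat as insignificant tokens, so the query stays valid.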
@@ -233,12 +293,135 @@
 class DataProcessor {
     constructor() {
         this.authors = {};
         this.commits = {};
         this.pulls = [];
     }
 
+    processLog(logRaw, logSize) {
+        // Parse the log, given in its "full" format. Records are presented in
+        // chronological order, line by line, with each record spanning
+        // several lines.
+        // The general format for each record is as follows:
+        //
+        //     commit COMMIT_HASH
+        //     Author: AUTHOR_NAME <AUTHOR_EMAIL>
+        //     Commit: COMMITTER_NAME <COMMITTER_EMAIL>
+        //
+        //         MESSAGE_HEADER
+        //
+        //         MESSAGE_BODY_MULTILINE
+        //
+        // The last line of the body can also be as follows, for cherry-picked commits:
+        //
+        //         (cherry picked from commit ORIGINAL_COMMIT_HASH)
+        //
+
+        // The most straightforward way to parse this format is to go line by line and check
+        // if we reach one of the metadata lines.
+        let logLines = logRaw.split("\n");
+        let commit = null;
+
+        while (logLines.length > 0) {
+            const line = logLines.shift();
+
+            // Check if the file starts with the first commit record.
+            if (commit == null && !GIT_HEAD_COMMIT_RE.test(line)) {
+                console.error("    Error parsing commit log: Invalid format.");
+                process.exitCode = ExitCodes.ParseFailure;
+                break;
+            }
+
+            // Start parsing a new commit; store the existing one if applicable.
+            let matches = line.match(GIT_HEAD_COMMIT_RE);
+            if (matches) {
+                if (commit != null) {
+                    this.commits[commit.hash] = commit;
+                }
+
+                commit = {
+                    "hash": matches[1],
+                    "author": "",
+                    "committer": "",
+
+                    "summary": "",
+                    "body": "",
+
+                    "is_cherrypick": false,
+                    "cherrypick_hash": "",
+                };
+                continue;
+            }
+
+            // Parse the authorship information.
+            matches = line.match(GIT_HEAD_AUTHOR_RE);
+            if (matches) {
+                commit.author = matches[1];
+                continue;
+            }
+            matches = line.match(GIT_HEAD_COMMITTER_RE);
+            if (matches) {
+                commit.committer = matches[1];
+                continue;
+            }
+
+            // By this point we should have the entire header, or we're broken.
+            if (commit.hash === "" || commit.author === "" || commit.committer === "") {
+                console.error("    Error parsing commit log: Invalid format.");
+                process.exitCode = ExitCodes.ParseFailure;
+                break;
+            }
+
+            // Start parsing the body.
+            matches = line.match(GIT_BODY_LINE_RE);
+
+            // Look for the first line of the commit message; it's our summary.
+            if (commit.summary === "") {
+                if (!matches) {
+                    continue;
+                }
+
+                commit.summary = matches[1];
+                continue;
+            }
+
+            // Treat as an empty line.
+            if (!matches) {
+                commit.body += "\n";
+                continue;
+            }
+
+            // Use the capture group to strip leading spaces.
+            commit.body += `${matches[1]}\n`;
+
+            // Check if this is a cherry-pick.
+            matches = line.match(GIT_BODY_CHERRYPICK_RE);
+            if (matches) {
+                commit.is_cherrypick = true;
+                commit.cherrypick_hash = matches[1];
+            }
+        }
+
+        // Store the last commit.
+        if (commit != null) {
+            this.commits[commit.hash] = commit;
+        }
+
+        let commitHashes = Object.keys(this.commits);
+        if (commitHashes.length !== logSize) {
+            console.error(`    Error parsing commit log: Expected to receive ${logSize} commits, but got ${commitHashes.length} instead.`);
+            process.exitCode = ExitCodes.ParseFailure;
+        }
+
+        return commitHashes;
+    }
+
     processCommits(commitsRaw) {
         try {
             commitsRaw.forEach((item) => {
+                // Commits can have multiple PRs associated with them, so we need to be on the lookout
+                // for rogue entries. Normally, it will always be one pull per commit (except for direct
+                // commits, which will have none), but GitHub may sometimes link commits to PRs in other
+                // repos/otherwise unrelated. So some form of filtering is required.
+
                 const pullsRaw = mapNodes(item.associatedPullRequests);
                 const pullItem = pullsRaw[0];
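To make the parser concrete, here is a minimal driving sketch: a fabricated two-record log in the "full" format is fed straight into processLog (in the real flow the text comes from getCommitHistory and the expected size from countCommitHistory):

    const processor = new DataProcessor();
    const logRaw = [
        "commit aaaa1111",
        "Author: Jane Doe <jane@example.com>",
        "Commit: John Doe <john@example.com>",
        "",
        "    Add a feature",
        "",
        "commit bbbb2222",
        "Author: Jane Doe <jane@example.com>",
        "Commit: John Doe <john@example.com>",
        "",
        "    Fix the feature",
        "",
        "    (cherry picked from commit cccc3333)",
    ].join("\n");

    const commitHashes = processor.processLog(logRaw, 2);
    console.log(commitHashes);                                  // ["aaaa1111", "bbbb2222"]
    console.log(processor.commits["bbbb2222"].is_cherrypick);   // true
    console.log(processor.commits["bbbb2222"].cherrypick_hash); // "cccc3333"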
@@ -310,26 +493,56 @@ function mapNodes(object) {
     return object.edges.map((item) => item["node"])
 }
 
+async function ensureDir(dirPath) {
+    try {
+        await fs.access(dirPath, fsConstants.R_OK | fsConstants.W_OK);
+    } catch (err) {
+        await fs.mkdir(dirPath);
+    }
+}
+
+async function clearDir(rootPath) {
+    try {
+        const pathStat = await fs.stat(rootPath);
+        if (!pathStat.isDirectory()) {
+            return;
+        }
+
+        const removeDir = async (dirPath) => {
+            const dirFiles = await fs.readdir(dirPath);
+            for (let entryName of dirFiles) {
+                if (entryName === "." || entryName === "..") {
+                    continue;
+                }
+
+                const entryPath = `${dirPath}/${entryName}`;
+                const entryStat = await fs.stat(entryPath);
+                if (entryStat.isDirectory()) {
+                    await removeDir(entryPath);
+                    await fs.rmdir(entryPath);
+                }
+                else if (entryStat.isFile()) {
+                    await fs.unlink(entryPath);
+                }
+            }
+        };
+
+        await removeDir(rootPath);
+    } catch (err) {
+        // ..
+    }
+}
+
 async function main() {
     // Internal utility methods.
-    const ensureDir = async (dirPath) => {
-        try {
-            const pathStat = await fs.stat(dirPath);
-            if (!pathStat.isDirectory()) {
-                await fs.mkdir(dirPath);
-            }
-        } catch (err) {
-            await fs.mkdir(dirPath);
-        }
-    }
     const checkForExit = () => {
         if (process.exitCode > 0) {
            process.exit();
        }
-    }
+    };
     const delay = async (msec) => {
         return new Promise(resolve => setTimeout(resolve, msec));
-    }
+    };
 
     // Getting PRs between two commits is a complicated task, and must be done in
     // multiple steps. GitHub API does not have a method for that, so we must improvise.
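clearDir walks the tree manually because fs.rmdir alone fails on non-empty directories. On Node.js 14.14+, the built-in recursive remover achieves the same end; a sketch of that alternative (a design note, not what this commit does; assumes an async context):

    const fs = require('fs').promises;

    // Remove the directory and everything under it; "force" ignores a missing path.
    await fs.rm("./temp", { recursive: true, force: true });
    // Recreate the (now empty) directory for the next checkout.
    await fs.mkdir("./temp", { recursive: true });

One behavioral difference: clearDir keeps the root directory itself in place and only deletes its contents.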
@@ -344,8 +557,14 @@ async function main() {
 
     console.log("[*] Building local pull request database.");
 
     // Configurable properties.
     let data_owner = "godotengine";
     let data_repo = "godot";
+    let first_commit = "4.0-stable";
+    let last_commit = "4.0.1-stable";
+
+    let skip_checkout = false;
 
     process.argv.forEach((arg) => {
         if (arg.indexOf("owner:") === 0) {
             data_owner = arg.substring(6);
@@ -353,6 +572,10 @@ async function main() {
         if (arg.indexOf("repo:") === 0) {
             data_repo = arg.substring(5);
         }
+
+        if (arg === "skip-checkout") {
+            skip_checkout = true;
+        }
     });
 
     console.log(`[*] Configured for the "${data_owner}/${data_repo}" repository.`);
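The offsets passed to substring line up with the prefixes being stripped: "owner:".length is 6 and "repo:".length is 5. A quick standalone check of the parsing rules (values illustrative):

    console.log("owner:godotengine".substring(6)); // "godotengine"
    console.log("repo:godot".substring(5));        // "godot"
    // The bare flag is matched verbatim: arg === "skip-checkout".

So a run shaped like `node compose-db.js owner:godotengine repo:godot skip-checkout` overrides both defaults and reuses an existing checkout.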
@@ -364,17 +587,29 @@ async function main() {
     checkForExit();
 
     // First, we checkout the repository for the specified branch/tag/hash. We will
-    // use it to retrieve a clean commit log, ignoring merge commits. We can also use
-    // it as a basis for our list of authors/contributors, as it's not always the
-    // same between the PR and the actual commit.
+    // use it to retrieve a clean commit log, ignoring merge commits. This step creates
+    // a shallow copy, as we are only interested in the history of the branch.
+    // Still, it extracts all of the current files, so it may take a bit of time.
 
+    await ensureDir("./temp");
+    if (!skip_checkout) {
+        console.log(`[*] Checking out the repository at "${last_commit}".`);
+        await dataFetcher.checkoutRepo(last_commit);
+        checkForExit();
+    }
+
     console.log(`[*] Extracting the commit log between "${first_commit}" and "${last_commit}".`);
+    const commitLogSize = await dataFetcher.countCommitHistory(first_commit, last_commit);
     const commitLog = await dataFetcher.getCommitHistory(first_commit, last_commit);
     checkForExit();
 
+    // Second, we parse the extracted commit log, to generate a list of commit hashes
+    // for the next step. We also try to extract the information about this being a
+    // cherry-pick, and not the original commit. We can rely on the commit message body
+    // containing a certain string, from which we can take the original commit hash.
+
+    const commitHashes = dataProcessor.processLog(commitLog, commitLogSize);
+    checkForExit();
+
-    // Second, we try to extract information about this being a cherry-pick. We can
-    // rely on the commit message body containing a certain string, from which we can
-    // take the original commit hash.
-    //
     // Third, we generate a query to the GraphQL API to fetch the information about
     // linked PRs. GraphQL API supports having multiple sub-queries, which can be our
     // gateway to fetching the data for a list of specific hashes.
@@ -383,25 +618,12 @@ async function main() {
     // It's also unclear whether this feature is limited to a certain number of subqueries
     // (say, 100), or not. We may need to do it in batches, as we do with paginated
     // queries.
-    //
-    // Fourth, we consolidate the information. Each run is performed on a certain range
-    // of branches/tags/hashes, and so we store the information we receive in files
-    // associated with this range. This process can be optimized by only working with
-    // smaller ranges, and composing bigger ranges out of them (e.g. using hashes for
-    // X.Y beta 1 and X.Y beta 2, and then X.Y beta 2 and X.Y beta 3, and then generating
-    // a complete list for X.Y-1 and X.Y on the frontend).
-
-    // Commits can have multiple PRs associated with them, so we need to be on the lookout
-    // for rogue entries. Normally, it will always be one pull per commit (except for direct
-    // commits, which will have none), but GitHub may sometimes link commits to PRs in other
-    // repos/otherwise unrelated. So some form of filtering is required.
 
     console.log("[*] Fetching commit data from GitHub.");
     // Pages are starting with 1 for better presentation.
     let page = 1;
     while (page <= dataFetcher.page_count) {
         //const commitsRaw = await dataFetcher.fetchCommits(page);
         //dataProcessor.processCommits(commitsRaw);
         //checkForExit();
         page++;
@@ -410,6 +632,13 @@ async function main() {
         await delay(1500);
     }
 
+    // Fourth, we consolidate the information. Each run is performed on a certain range
+    // of branches/tags/hashes, and so we store the information we receive in files
+    // associated with this range. This process can be optimized by only working with
+    // smaller ranges, and composing bigger ranges out of them (e.g. using hashes for
+    // X.Y beta 1 and X.Y beta 2, and then X.Y beta 2 and X.Y beta 3, and then generating
+    // a complete list for X.Y-1 and X.Y on the frontend).
+
     console.log("[*] Checking the rate limits after.");
     await dataFetcher.checkRates();
     checkForExit();
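For reference, the delay helper used in the loop above is a plain promise wrapper around setTimeout; the 1500 ms pause between pages keeps the script clear of API rate limits. A standalone sketch (fetchOnePage is a fabricated stand-in for the commented-out fetch/process calls):

    const delay = async (msec) => {
        return new Promise(resolve => setTimeout(resolve, msec));
    };

    async function fetchAllPages(pageCount) {
        for (let page = 1; page <= pageCount; page++) {
            // await fetchOnePage(page); // placeholder for the real fetching step
            await delay(1500); // throttle between requests
        }
    }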