From 2fbcbca9b7c78fd73f00e9a442b937364ecfe534 Mon Sep 17 00:00:00 2001
From: Yuri Sizov
Date: Tue, 21 Mar 2023 22:35:40 +0100
Subject: [PATCH] Implement fetching and parsing of the commit log

- Validate the results with a size check.
---
 .gitignore    |   1 +
 compose-db.js | 323 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 277 insertions(+), 47 deletions(-)

diff --git a/.gitignore b/.gitignore
index f40a54f..5a70d9b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 # Project folders.
 node_modules/
 out/
+temp/
 logs/
 
 # Development environments.

diff --git a/compose-db.js b/compose-db.js
index c4e78e8..f8525b2 100644
--- a/compose-db.js
+++ b/compose-db.js
@@ -1,10 +1,18 @@
 const fs = require('fs').promises;
 const fsConstants = require('fs').constants;
+const nodeUtil = require('util');
 const fetch = require('node-fetch');
+const exec = nodeUtil.promisify(require('child_process').exec);
 
 const ExitCodes = {
     "RequestFailure": 1,
     "ParseFailure": 2,
+    "ExecFailure": 3,
+};
+
+const LogFormat = {
+    "Raw": 0,
+    "JSON": 1,
 };
 
 const ITEMS_PER_PAGE = 100;
@@ -17,8 +25,17 @@ const API_RATE_LIMIT = `
     }
 `;
 
+// Note: these patterns are built from string literals, so backslashes must be
+// doubled (e.g. "\\s", "\\(") for the escapes to survive into the regular expression.
+const GIT_HEAD_COMMIT_RE = RegExp("^commit ([a-zA-Z0-9-_]+)$");
+const GIT_HEAD_AUTHOR_RE = RegExp("^Author: (.+)$");
+const GIT_HEAD_COMMITTER_RE = RegExp("^Commit: (.+)$");
+const GIT_BODY_LINE_RE = RegExp("^[\\s]{2,}(.*)$");
+const GIT_BODY_CHERRYPICK_RE = RegExp("^[\\s]{2,}\\(cherry picked from commit ([a-zA-Z0-9-_]+)\\)$");
+
 class DataFetcher {
     constructor(data_owner, data_repo) {
+        this.data_owner = data_owner;
+        this.data_repo = data_repo;
+        this.repo_ssh_path = `git@github.com:${data_owner}/${data_repo}.git`;
         this.api_rest_path = `https://api.github.com/repos/${data_owner}/${data_repo}`;
         this.api_repository_id = `owner:"${data_owner}" name:"${data_repo}"`;
@@ -27,15 +44,19 @@ class DataFetcher {
         this.last_cursor = "";
     }
 
-    async _logResponse(data, name) {
+    async _logResponse(data, name, format = LogFormat.JSON) {
         try {
-            try {
-                await fs.access("logs", fsConstants.R_OK | fsConstants.W_OK);
-            } catch (err) {
-                await fs.mkdir("logs");
+            await ensureDir("./logs");
+
+            let filename = `./logs/${name}`;
+            let fileContent = "" + data;
+
+            if (format === LogFormat.JSON) {
+                filename = `./logs/${name}.json`;
+                fileContent = JSON.stringify(data, null, 4);
             }
-
-            await fs.writeFile(`logs/${name}.json`, JSON.stringify(data, null, 4), {encoding: "utf-8"});
+
+            await fs.writeFile(filename, fileContent, {encoding: "utf-8"});
         } catch (err) {
             console.error("Error saving log file: " + err);
         }
@@ -61,11 +82,46 @@ class DataFetcher {
     }
 
     async checkoutRepo(atCommit) {
+        try {
+            // Make sure that the temp folder exists and is empty.
+            await ensureDir("./temp");
+            await clearDir("./temp");
+            // Check out a treeless partial clone of the repository; we are only interested in its history.
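+            // Note: `--branch` accepts a branch or tag name (e.g. "4.0.1-stable"), but not
+            // an arbitrary commit hash, while `--filter=tree:0` defers fetching file data
+            // until it is actually needed for the checkout.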
+            await exec(`git clone --filter=tree:0 --branch ${atCommit} --single-branch ${this.repo_ssh_path}`, { cwd: "./temp" });
+        } catch (err) {
+            console.error("    Error checking out a copy of the target repository: " + err);
+            process.exitCode = ExitCodes.ExecFailure;
+            return;
+        }
     }
 
-    getCommitHistory(fromCommit, toCommit) {
+    async countCommitHistory(fromCommit, toCommit) {
+        try {
+            const { stdout, stderr } = await exec(`git log --pretty=oneline --no-merges ${fromCommit}..${toCommit}`, { cwd: `./temp/${this.data_repo}` });
+            const commitHistory = stdout.trimEnd();
+            await this._logResponse(commitHistory, "_commit_shortlog", LogFormat.Raw);
+            return commitHistory.split("\n").length;
+        } catch (err) {
+            console.error("    Error extracting the commit history: " + err);
+            process.exitCode = ExitCodes.ExecFailure;
+            return 0;
+        }
+    }
+
+    async getCommitHistory(fromCommit, toCommit) {
+        try {
+            const { stdout, stderr } = await exec(`git log --pretty=full --no-merges ${fromCommit}..${toCommit}`, { cwd: `./temp/${this.data_repo}` });
+
+            const commitHistory = stdout;
+            await this._logResponse(commitHistory, "_commit_history", LogFormat.Raw);
+            return commitHistory;
+        } catch (err) {
+            console.error("    Error extracting the commit history: " + err);
+            process.exitCode = ExitCodes.ExecFailure;
+            return "";
+        }
     }
 
     async fetchGithub(query) {
@@ -138,10 +194,14 @@ class DataFetcher {
                             messageHeadline
                             messageBody
+
                             author {
-                                date
-                                email
-                                name
+                                user {
+                                    login
+                                    avatarUrl
+                                    url
+                                    id
+                                }
                             }
 
                             associatedPullRequests (first: 100) {
@@ -185,19 +245,19 @@ class DataFetcher {
         `
     }
 
-    async fetchCommits(commits) {
+    async fetchCommits(commitHashes) {
         try {
             const query = `
             query {
                 ${API_RATE_LIMIT}
 
-                ${commits.map((item) => {
+                ${commitHashes.map((item) => {
                     return this._getCommitQuery(item) + "\n";
                 })}
             }
             `;
 
-            console.log(`    Requesting a batch of ${commits.length} commits.`);
+            console.log(`    Requesting a batch of ${commitHashes.length} commits.`);
 
             const res = await this.fetchGithub(query);
             if (res.status !== 200) {
@@ -233,12 +293,135 @@ class DataFetcher {
 
 class DataProcessor {
     constructor() {
         this.authors = {};
+        this.commits = {};
         this.pulls = [];
     }
 
+    processLog(logRaw, logSize) {
+        // Parse the log, given in its "full" format. Records are presented in
+        // chronological order, line by line, with each record spanning several lines.
+        // The general format for each record is as follows:
+        //
+        //     commit COMMIT_HASH
+        //     Author: AUTHOR_NAME
+        //     Commit: COMMITTER_NAME
+        //
+        //         MESSAGE_HEADER
+        //
+        //         MESSAGE_BODY_MULTILINE
+        //
+        // For cherry-picked commits, the last line of the body can also be as follows:
+        //
+        //         (cherry picked from commit ORIGINAL_COMMIT_HASH)
+        //
+
+        // The most straightforward way to parse this format is to go line by line and check
+        // if we reach one of the metadata lines.
+        let logLines = logRaw.split("\n");
+        let commit = null;
+
+        while (logLines.length > 0) {
+            const line = logLines.shift();
+
+            // Check that the log starts with a commit record.
+            if (commit == null && !GIT_HEAD_COMMIT_RE.test(line)) {
+                console.error("    Error parsing commit log: Invalid format.");
+                process.exitCode = ExitCodes.ParseFailure;
+                break;
+            }
+
+            // Start parsing a new commit; store the existing one if applicable.
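+            // A new "commit" line marks the beginning of the next record, which means
+            // the record collected so far is complete and can be stored.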
+            let matches = line.match(GIT_HEAD_COMMIT_RE);
+            if (matches) {
+                if (commit != null) {
+                    this.commits[commit.hash] = commit;
+                }
+
+                commit = {
+                    "hash": matches[1],
+                    "author": "",
+                    "committer": "",
+
+                    "summary": "",
+                    "body": "",
+
+                    "is_cherrypick": false,
+                    "cherrypick_hash": "",
+                };
+                continue;
+            }
+
+            // Parse the authorship information.
+            matches = line.match(GIT_HEAD_AUTHOR_RE);
+            if (matches) {
+                commit.author = matches[1];
+                continue;
+            }
+            matches = line.match(GIT_HEAD_COMMITTER_RE);
+            if (matches) {
+                commit.committer = matches[1];
+                continue;
+            }
+
+            // By this point we should have the entire header; otherwise the format is broken.
+            if (commit.hash === "" || commit.author === "" || commit.committer === "") {
+                console.error("    Error parsing commit log: Invalid format.");
+                process.exitCode = ExitCodes.ParseFailure;
+                break;
+            }
+
+            // Start parsing the body.
+            matches = line.match(GIT_BODY_LINE_RE);
+
+            // Look for the first line of the commit message; it's our summary.
+            if (commit.summary === "") {
+                if (!matches) {
+                    continue;
+                }
+
+                commit.summary = matches[1];
+                continue;
+            }
+
+            // Treat non-matching lines as empty.
+            if (!matches) {
+                commit.body += "\n";
+                continue;
+            }
+            // Use the capture group to strip the leading spaces.
+            commit.body += `${matches[1]}\n`;
+
+            // Check if this is a cherry-pick.
+            matches = line.match(GIT_BODY_CHERRYPICK_RE);
+            if (matches) {
+                commit.is_cherrypick = true;
+                commit.cherrypick_hash = matches[1];
+            }
+        }
+
+        // Store the last commit.
+        if (commit != null) {
+            this.commits[commit.hash] = commit;
+        }
+
+        let commitHashes = Object.keys(this.commits);
+        if (commitHashes.length !== logSize) {
+            console.error(`    Error parsing commit log: Expected to receive ${logSize} commits, but got ${commitHashes.length} instead.`);
+            process.exitCode = ExitCodes.ParseFailure;
+        }
+
+        return commitHashes;
+    }
+
     processCommits(commitsRaw) {
         try {
             commitsRaw.forEach((item) => {
+                // Commits can have multiple PRs associated with them, so we need to be on the lookout
+                // for rogue entries. Normally, it will always be one pull per commit (except for direct
+                // commits, which will have none), but GitHub may sometimes link commits to PRs in other
+                // repositories, or to otherwise unrelated PRs. So some form of filtering is required.
+
                 const pullsRaw = mapNodes(item.associatedPullRequests);
                 const pullItem = pullsRaw[0];
@@ -310,26 +493,56 @@ function mapNodes(object) {
     return object.edges.map((item) => item["node"])
 }
 
+// Creates the given directory, unless it already exists.
+async function ensureDir(dirPath) {
+    try {
+        await fs.access(dirPath, fsConstants.R_OK | fsConstants.W_OK);
+    } catch (err) {
+        await fs.mkdir(dirPath);
+    }
+}
+
+// Recursively removes the contents of the given directory, keeping the directory itself.
+async function clearDir(rootPath) {
+    try {
+        const pathStat = await fs.stat(rootPath);
+        if (!pathStat.isDirectory()) {
+            return;
+        }
+
+        const removeDir = async (dirPath) => {
+            const dirFiles = await fs.readdir(dirPath);
+            for (let entryName of dirFiles) {
+                if (entryName === "." || entryName === "..") {
+                    continue;
+                }
+
+                const entryPath = `${dirPath}/${entryName}`;
+                const entryStat = await fs.stat(entryPath);
+                if (entryStat.isDirectory()) {
+                    await removeDir(entryPath);
+                    await fs.rmdir(entryPath);
+                }
+                else if (entryStat.isFile()) {
+                    await fs.unlink(entryPath);
+                }
+            }
+        };
+
+        await removeDir(rootPath);
+    } catch (err) {
+        // Best effort; if the directory doesn't exist, there is nothing to clear.
+    }
+}
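+
+// For example, this is how the checkout step above prepares its scratch folder:
+//
+//   await ensureDir("./temp");
+//   await clearDir("./temp");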
 
 async function main() {
     // Internal utility methods.
-    const ensureDir = async (dirPath) => {
-        try {
-            const pathStat = await fs.stat(dirPath);
-            if (!pathStat.isDirectory()) {
-                await fs.mkdir(dirPath);
-            }
-        } catch (err) {
-            await fs.mkdir(dirPath);
-        }
-    }
     const checkForExit = () => {
         if (process.exitCode > 0) {
             process.exit();
         }
-    }
+    };
     const delay = async (msec) => {
         return new Promise(resolve => setTimeout(resolve, msec));
-    }
+    };
 
     // Getting PRs between two commits is a complicated task, and must be done in
     // multiple steps. GitHub API does not have a method for that, so we must improvise.
@@ -344,8 +557,14 @@ async function main() {
 
     console.log("[*] Building local pull request database.");
 
+    // Configurable properties.
     let data_owner = "godotengine";
     let data_repo = "godot";
+    let first_commit = "4.0-stable";
+    let last_commit = "4.0.1-stable";
+
+    let skip_checkout = false;
+
     process.argv.forEach((arg) => {
         if (arg.indexOf("owner:") === 0) {
             data_owner = arg.substring(6);
@@ -353,6 +572,10 @@ async function main() {
         if (arg.indexOf("repo:") === 0) {
             data_repo = arg.substring(5);
         }
+
+        if (arg === "skip-checkout") {
+            skip_checkout = true;
+        }
     });
 
     console.log(`[*] Configured for the "${data_owner}/${data_repo}" repository.`);
@@ -364,17 +587,29 @@ async function main() {
     checkForExit();
 
     // First, we checkout the repository for the specified branch/tag/hash. We will
-    // use it to retrieve a clean commit log, ignoring merge commits. We can also use
-    // it as a basis for our list of authors/contributors, as it's not always the
-    // same between the PR and the actual commit.
+    // use it to retrieve a clean commit log, ignoring merge commits. This step creates
+    // a partial copy, as we are only interested in the history of the branch.
+    // Still, it extracts all of the current files, so it may take a bit of time.
 
-    await ensureDir("./temp");
+    if (!skip_checkout) {
+        console.log(`[*] Checking out the repository at "${last_commit}".`);
+        await dataFetcher.checkoutRepo(last_commit);
+        checkForExit();
+    }
 
+    console.log(`[*] Extracting the commit log between "${first_commit}" and "${last_commit}".`);
+    const commitLogSize = await dataFetcher.countCommitHistory(first_commit, last_commit);
+    const commitLog = await dataFetcher.getCommitHistory(first_commit, last_commit);
+    checkForExit();
+
+    // Second, we parse the extracted commit log to generate a list of commit hashes
+    // for the next step. We also try to extract the information about a commit being a
+    // cherry-pick, and not the original commit. We can rely on the commit message body
+    // containing a certain string, from which we can take the original commit hash.
+
+    const commitHashes = dataProcessor.processLog(commitLog, commitLogSize);
+    checkForExit();
 
-    // Second, we try to extract information about this being a cherry-pick. We can
-    // rely on the commit message body containing a certain string, from which we can
-    // take the original commit hash.
-    //
     // Third, we generate a query to the GraphQL API to fetch the information about
     // linked PRs. GraphQL API supports having multiple sub-queries, which can be our
    // gateway to fetching the data for a list of specific hashes.
     //
     // It's also unclear whether this feature is limited to a certain number of subqueries
     // (say, 100), or not. We may need to do it in batches, as we do with paginated
     // queries.
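+    //
+    // For reference, a batched query with aliased sub-queries would look roughly like
+    // this (the aliases and the selected fields here are illustrative only):
+    //
+    //     query {
+    //         commit_0: repository(owner: "godotengine", name: "godot") {
+    //             object(oid: "COMMIT_HASH_0") { ... on Commit { messageHeadline } }
+    //         }
+    //         commit_1: repository(owner: "godotengine", name: "godot") {
+    //             object(oid: "COMMIT_HASH_1") { ... on Commit { messageHeadline } }
+    //         }
+    //     }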
-    //
-    // Fourth, we consolidate the information. Each run is performed on a certain range
-    // of branches/tags/hashes, and so we store the information we receive in files
-    // associated with this range. This process can be optimized by only working with
-    // smaller ranges, and composing bigger ranges out of them (e.g. using hashes for
-    // X.Y beta 1 and X.Y beta 2, and then X.Y beta 2 and X.Y beta 3, and then generating
-    // a complete list for X.Y-1 and X.Y on the frontend).
-
-    // Commits can have multiple PRs associated with them, so we need to be on the lookout
-    // for rogue entries. Normally, it will always be one pull per commit (except for direct
-    // commits, which will have none), but GitHub may sometimes link commits to PRs in other
-    // repos/otherwise unrelated. So some form of filtering is required.
 
     console.log("[*] Fetching commit data from GitHub.");
     // Pages are starting with 1 for better presentation.
     let page = 1;
     while (page <= dataFetcher.page_count) {
         //const commitsRaw = await dataFetcher.fetchCommits(page);
-        //dataProcessor.processCommits(commitsRaw);
         //checkForExit();
 
         page++;
@@ -410,6 +632,13 @@ async function main() {
         await delay(1500);
     }
 
+    // Fourth, we consolidate the information. Each run is performed on a certain range
+    // of branches/tags/hashes, and so we store the information we receive in files
+    // associated with this range. This process can be optimized by only working with
+    // smaller ranges, and composing bigger ranges out of them (e.g. using hashes for
+    // X.Y beta 1 and X.Y beta 2, and then X.Y beta 2 and X.Y beta 3, and then generating
+    // a complete list for X.Y-1 and X.Y on the frontend).
+
     console.log("[*] Checking the rate limits after.")
     await dataFetcher.checkRates();
     checkForExit();