From 2fbcbca9b7c78fd73f00e9a442b937364ecfe534 Mon Sep 17 00:00:00 2001
From: Yuri Sizov
Date: Tue, 21 Mar 2023 22:35:40 +0100
Subject: [PATCH] Implement fetching and parsing of the commit log

- Validate the results with a size check.
---
 .gitignore    |   1 +
 compose-db.js | 323 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 277 insertions(+), 47 deletions(-)

diff --git a/.gitignore b/.gitignore
index f40a54f..5a70d9b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 # Project folders.
 node_modules/
 out/
+temp/
 logs/
 
 # Development environments.

diff --git a/compose-db.js b/compose-db.js
index c4e78e8..f8525b2 100644
--- a/compose-db.js
+++ b/compose-db.js
@@ -1,10 +1,18 @@
 const fs = require('fs').promises;
 const fsConstants = require('fs').constants;
+const nodeUtil = require('util');
 const fetch = require('node-fetch');
+const exec = nodeUtil.promisify(require('child_process').exec);
 
 const ExitCodes = {
     "RequestFailure": 1,
     "ParseFailure": 2,
+    "ExecFailure": 3,
+};
+
+const LogFormat = {
+    "Raw": 0,
+    "JSON": 1,
 };
 
 const ITEMS_PER_PAGE = 100;
@@ -17,8 +25,17 @@ const API_RATE_LIMIT = `
     }
 `;
 
+// Note: these patterns are built from string literals, so backslashes must be
+// doubled (e.g. "\\s", "\\(") for the escapes to survive into the regular expression.
+const GIT_HEAD_COMMIT_RE = RegExp("^commit ([a-zA-Z0-9-_]+)$");
+const GIT_HEAD_AUTHOR_RE = RegExp("^Author: (.+)$");
+const GIT_HEAD_COMMITTER_RE = RegExp("^Commit: (.+)$");
+const GIT_BODY_LINE_RE = RegExp("^[\\s]{2,}(.*)$");
+const GIT_BODY_CHERRYPICK_RE = RegExp("^[\\s]{2,}\\(cherry picked from commit ([a-zA-Z0-9-_]+)\\)$");
+
 class DataFetcher {
     constructor(data_owner, data_repo) {
+        this.data_owner = data_owner;
+        this.data_repo = data_repo;
+        this.repo_ssh_path = `git@github.com:${data_owner}/${data_repo}.git`;
         this.api_rest_path = `https://api.github.com/repos/${data_owner}/${data_repo}`;
         this.api_repository_id = `owner:"${data_owner}" name:"${data_repo}"`;
@@ -27,15 +44,19 @@ class DataFetcher {
         this.last_cursor = "";
     }
 
-    async _logResponse(data, name) {
+    async _logResponse(data, name, format = LogFormat.JSON) {
         try {
-            try {
-                await fs.access("logs", fsConstants.R_OK | fsConstants.W_OK);
-            } catch (err) {
-                await fs.mkdir("logs");
+            await ensureDir("./logs");
+
+            let filename = `./logs/${name}`;
+            let fileContent = "" + data;
+
+            if (format === LogFormat.JSON) {
+                filename = `./logs/${name}.json`;
+                fileContent = JSON.stringify(data, null, 4);
             }
-
-            await fs.writeFile(`logs/${name}.json`, JSON.stringify(data, null, 4), {encoding: "utf-8"});
+
+            await fs.writeFile(filename, fileContent, {encoding: "utf-8"});
         } catch (err) {
             console.error("Error saving log file: " + err);
         }
@@ -61,11 +82,46 @@ class DataFetcher {
     }
 
     async checkoutRepo(atCommit) {
+        try {
+            // Make sure that the temp folder exists and is empty.
+            await ensureDir("./temp");
+            await clearDir("./temp");
+            // Check out a treeless partial clone of the repository; we are only interested in its history.
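+            // Note: `--branch` accepts a branch or tag name (e.g. "4.0.1-stable"), but not
+            // an arbitrary commit hash, while `--filter=tree:0` defers fetching file data
+            // until it is actually needed for the checkout.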
+            await exec(`git clone --filter=tree:0 --branch ${atCommit} --single-branch ${this.repo_ssh_path}`, { cwd: "./temp" });
+        } catch (err) {
+            console.error("    Error checking out a copy of the target repository: " + err);
+            process.exitCode = ExitCodes.ExecFailure;
+            return;
+        }
     }
 
-    getCommitHistory(fromCommit, toCommit) {
+    async countCommitHistory(fromCommit, toCommit) {
+        try {
+            const { stdout, stderr } = await exec(`git log --pretty=oneline --no-merges ${fromCommit}..${toCommit}`, { cwd: `./temp/${this.data_repo}` });
+            const commitHistory = stdout.trimEnd();
+            await this._logResponse(commitHistory, "_commit_shortlog", LogFormat.Raw);
+            return commitHistory.split("\n").length;
+        } catch (err) {
+            console.error("    Error extracting the commit history: " + err);
+            process.exitCode = ExitCodes.ExecFailure;
+            return 0;
+        }
+    }
+
+    async getCommitHistory(fromCommit, toCommit) {
+        try {
+            const { stdout, stderr } = await exec(`git log --pretty=full --no-merges ${fromCommit}..${toCommit}`, { cwd: `./temp/${this.data_repo}` });
+
+            const commitHistory = stdout;
+            await this._logResponse(commitHistory, "_commit_history", LogFormat.Raw);
+            return commitHistory;
+        } catch (err) {
+            console.error("    Error extracting the commit history: " + err);
+            process.exitCode = ExitCodes.ExecFailure;
+            return "";
+        }
     }
 
     async fetchGithub(query) {
@@ -138,10 +194,14 @@ class DataFetcher {
                             messageHeadline
                             messageBody
+
                             author {
-                                date
-                                email
-                                name
+                                user {
+                                    login
+                                    avatarUrl
+                                    url
+                                    id
+                                }
                             }
 
                             associatedPullRequests (first: 100) {
@@ -185,19 +245,19 @@ class DataFetcher {
         `
     }
 
-    async fetchCommits(commits) {
+    async fetchCommits(commitHashes) {
         try {
             const query = `
             query {
                 ${API_RATE_LIMIT}
 
-                ${commits.map((item) => {
+                ${commitHashes.map((item) => {
                     return this._getCommitQuery(item) + "\n";
                 })}
             }
             `;
 
-            console.log(`    Requesting a batch of ${commits.length} commits.`);
+            console.log(`    Requesting a batch of ${commitHashes.length} commits.`);
 
             const res = await this.fetchGithub(query);
             if (res.status !== 200) {
@@ -233,12 +293,135 @@ class DataFetcher {
 
 class DataProcessor {
     constructor() {
         this.authors = {};
+        this.commits = {};
         this.pulls = [];
     }
 
+    processLog(logRaw, logSize) {
+        // Parse the log, given in its "full" format. Records are presented in
+        // chronological order, line by line, with each record spanning several lines.
+        // The general format for each record is as follows:
+        //
+        //     commit COMMIT_HASH
+        //     Author: AUTHOR_NAME
+        //     Commit: COMMITTER_NAME
+        //
+        //         MESSAGE_HEADER
+        //
+        //         MESSAGE_BODY_MULTILINE
+        //
+        // For cherry-picked commits, the last line of the body can also be as follows:
+        //
+        //         (cherry picked from commit ORIGINAL_COMMIT_HASH)
+        //
+
+        // The most straightforward way to parse this format is to go line by line and check
+        // if we reach one of the metadata lines.
+        let logLines = logRaw.split("\n");
+        let commit = null;
+
+        while (logLines.length > 0) {
+            const line = logLines.shift();
+
+            // Check that the log starts with a commit record.
+            if (commit == null && !GIT_HEAD_COMMIT_RE.test(line)) {
+                console.error("    Error parsing commit log: Invalid format.");
+                process.exitCode = ExitCodes.ParseFailure;
+                break;
+            }
+
+            // Start parsing a new commit; store the existing one if applicable.
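+            // A new "commit" line marks the beginning of the next record, which means
+            // the record collected so far is complete and can be stored.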
+            let matches = line.match(GIT_HEAD_COMMIT_RE);
+            if (matches) {
+                if (commit != null) {
+                    this.commits[commit.hash] = commit;
+                }
+
+                commit = {
+                    "hash": matches[1],
+                    "author": "",
+                    "committer": "",
+
+                    "summary": "",
+                    "body": "",
+
+                    "is_cherrypick": false,
+                    "cherrypick_hash": "",
+                };
+                continue;
+            }
+
+            // Parse the authorship information.
+            matches = line.match(GIT_HEAD_AUTHOR_RE);
+            if (matches) {
+                commit.author = matches[1];
+                continue;
+            }
+            matches = line.match(GIT_HEAD_COMMITTER_RE);
+            if (matches) {
+                commit.committer = matches[1];
+                continue;
+            }
+
+            // By this point we should have the entire header; otherwise the format is broken.
+            if (commit.hash === "" || commit.author === "" || commit.committer === "") {
+                console.error("    Error parsing commit log: Invalid format.");
+                process.exitCode = ExitCodes.ParseFailure;
+                break;
+            }
+
+            // Start parsing the body.
+            matches = line.match(GIT_BODY_LINE_RE);
+
+            // Look for the first line of the commit message; it's our summary.
+            if (commit.summary === "") {
+                if (!matches) {
+                    continue;
+                }
+
+                commit.summary = matches[1];
+                continue;
+            }
+
+            // Treat non-matching lines as empty.
+            if (!matches) {
+                commit.body += "\n";
+                continue;
+            }
+            // Use the capture group to strip the leading spaces.
+            commit.body += `${matches[1]}\n`;
+
+            // Check if this is a cherry-pick.
+            matches = line.match(GIT_BODY_CHERRYPICK_RE);
+            if (matches) {
+                commit.is_cherrypick = true;
+                commit.cherrypick_hash = matches[1];
+            }
+        }
+
+        // Store the last commit.
+        if (commit != null) {
+            this.commits[commit.hash] = commit;
+        }
+
+        let commitHashes = Object.keys(this.commits);
+        if (commitHashes.length !== logSize) {
+            console.error(`    Error parsing commit log: Expected to receive ${logSize} commits, but got ${commitHashes.length} instead.`);
+            process.exitCode = ExitCodes.ParseFailure;
+        }
+
+        return commitHashes;
+    }
+
     processCommits(commitsRaw) {
         try {
             commitsRaw.forEach((item) => {
+                // Commits can have multiple PRs associated with them, so we need to be on the lookout
+                // for rogue entries. Normally, it will always be one pull per commit (except for direct
+                // commits, which will have none), but GitHub may sometimes link commits to PRs in other
+                // repositories, or to otherwise unrelated PRs. So some form of filtering is required.
+
                 const pullsRaw = mapNodes(item.associatedPullRequests);
                 const pullItem = pullsRaw[0];
@@ -310,26 +493,56 @@ function mapNodes(object) {
     return object.edges.map((item) => item["node"])
 }
 
+// Creates the given directory, unless it already exists.
+async function ensureDir(dirPath) {
+    try {
+        await fs.access(dirPath, fsConstants.R_OK | fsConstants.W_OK);
+    } catch (err) {
+        await fs.mkdir(dirPath);
+    }
+}
+
+// Recursively removes the contents of the given directory, keeping the directory itself.
+async function clearDir(rootPath) {
+    try {
+        const pathStat = await fs.stat(rootPath);
+        if (!pathStat.isDirectory()) {
+            return;
+        }
+
+        const removeDir = async (dirPath) => {
+            const dirFiles = await fs.readdir(dirPath);
+            for (let entryName of dirFiles) {
+                if (entryName === "." || entryName === "..") {
+                    continue;
+                }
+
+                const entryPath = `${dirPath}/${entryName}`;
+                const entryStat = await fs.stat(entryPath);
+                if (entryStat.isDirectory()) {
+                    await removeDir(entryPath);
+                    await fs.rmdir(entryPath);
+                }
+                else if (entryStat.isFile()) {
+                    await fs.unlink(entryPath);
+                }
+            }
+        };
+
+        await removeDir(rootPath);
+    } catch (err) {
+        // Best effort; if the directory doesn't exist, there is nothing to clear.
+    }
+}
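+
+// For example, this is how the checkout step above prepares its scratch folder:
+//
+//   await ensureDir("./temp");
+//   await clearDir("./temp");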
 
 async function main() {
     // Internal utility methods.
-    const ensureDir = async (dirPath) => {
-        try {
-            const pathStat = await fs.stat(dirPath);
-            if (!pathStat.isDirectory()) {
-                await fs.mkdir(dirPath);
-            }
-        } catch (err) {
-            await fs.mkdir(dirPath);
-        }
-    }
     const checkForExit = () => {
         if (process.exitCode > 0) {
             process.exit();
         }
-    }
+    };
     const delay = async (msec) => {
         return new Promise(resolve => setTimeout(resolve, msec));
-    }
+    };
 
     // Getting PRs between two commits is a complicated task, and must be done in
     // multiple steps. GitHub API does not have a method for that, so we must improvise.
@@ -344,8 +557,14 @@ async function main() {
 
     console.log("[*] Building local pull request database.");
 
+    // Configurable properties.
     let data_owner = "godotengine";
     let data_repo = "godot";
+    let first_commit = "4.0-stable";
+    let last_commit = "4.0.1-stable";
+
+    let skip_checkout = false;
+
     process.argv.forEach((arg) => {
         if (arg.indexOf("owner:") === 0) {
             data_owner = arg.substring(6);
@@ -353,6 +572,10 @@ async function main() {
         if (arg.indexOf("repo:") === 0) {
             data_repo = arg.substring(5);
         }
+
+        if (arg === "skip-checkout") {
+            skip_checkout = true;
+        }
     });
 
     console.log(`[*] Configured for the "${data_owner}/${data_repo}" repository.`);
@@ -364,17 +587,29 @@ async function main() {
     checkForExit();
 
     // First, we checkout the repository for the specified branch/tag/hash. We will
-    // use it to retrieve a clean commit log, ignoring merge commits. We can also use
-    // it as a basis for our list of authors/contributors, as it's not always the
-    // same between the PR and the actual commit.
+    // use it to retrieve a clean commit log, ignoring merge commits. This step creates
+    // a partial copy, as we are only interested in the history of the branch.
+    // Still, it extracts all of the current files, so it may take a bit of time.
 
-    await ensureDir("./temp");
+    if (!skip_checkout) {
+        console.log(`[*] Checking out the repository at "${last_commit}".`);
+        await dataFetcher.checkoutRepo(last_commit);
+        checkForExit();
+    }
 
+    console.log(`[*] Extracting the commit log between "${first_commit}" and "${last_commit}".`);
+    const commitLogSize = await dataFetcher.countCommitHistory(first_commit, last_commit);
+    const commitLog = await dataFetcher.getCommitHistory(first_commit, last_commit);
+    checkForExit();
+
+    // Second, we parse the extracted commit log to generate a list of commit hashes
+    // for the next step. We also try to extract the information about a commit being a
+    // cherry-pick, and not the original commit. We can rely on the commit message body
+    // containing a certain string, from which we can take the original commit hash.
+
+    const commitHashes = dataProcessor.processLog(commitLog, commitLogSize);
+    checkForExit();
 
-    // Second, we try to extract information about this being a cherry-pick. We can
-    // rely on the commit message body containing a certain string, from which we can
-    // take the original commit hash.
-    //
     // Third, we generate a query to the GraphQL API to fetch the information about
     // linked PRs. GraphQL API supports having multiple sub-queries, which can be our
    // gateway to fetching the data for a list of specific hashes.
     //
     // It's also unclear whether this feature is limited to a certain number of subqueries
     // (say, 100), or not. We may need to do it in batches, as we do with paginated
     // queries.
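+    //
+    // For reference, a batched query with aliased sub-queries would look roughly like
+    // this (the aliases and the selected fields here are illustrative only):
+    //
+    //     query {
+    //         commit_0: repository(owner: "godotengine", name: "godot") {
+    //             object(oid: "COMMIT_HASH_0") { ... on Commit { messageHeadline } }
+    //         }
+    //         commit_1: repository(owner: "godotengine", name: "godot") {
+    //             object(oid: "COMMIT_HASH_1") { ... on Commit { messageHeadline } }
+    //         }
+    //     }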
-    //
-    // Fourth, we consolidate the information. Each run is performed on a certain range
-    // of branches/tags/hashes, and so we store the information we receive in files
-    // associated with this range. This process can be optimized by only working with
-    // smaller ranges, and composing bigger ranges out of them (e.g. using hashes for
-    // X.Y beta 1 and X.Y beta 2, and then X.Y beta 2 and X.Y beta 3, and then generating
-    // a complete list for X.Y-1 and X.Y on the frontend).
-
-    // Commits can have multiple PRs associated with them, so we need to be on the lookout
-    // for rogue entries. Normally, it will always be one pull per commit (except for direct
-    // commits, which will have none), but GitHub may sometimes link commits to PRs in other
-    // repos/otherwise unrelated. So some form of filtering is required.
 
     console.log("[*] Fetching commit data from GitHub.");
     // Pages are starting with 1 for better presentation.
     let page = 1;
     while (page <= dataFetcher.page_count) {
         //const commitsRaw = await dataFetcher.fetchCommits(page);
-        //dataProcessor.processCommits(commitsRaw);
         //checkForExit();
 
         page++;
@@ -410,6 +632,13 @@ async function main() {
         await delay(1500);
     }
 
+    // Fourth, we consolidate the information. Each run is performed on a certain range
+    // of branches/tags/hashes, and so we store the information we receive in files
+    // associated with this range. This process can be optimized by only working with
+    // smaller ranges, and composing bigger ranges out of them (e.g. using hashes for
+    // X.Y beta 1 and X.Y beta 2, and then X.Y beta 2 and X.Y beta 3, and then generating
+    // a complete list for X.Y-1 and X.Y on the frontend).
+
     console.log("[*] Checking the rate limits after.")
     await dataFetcher.checkRates();
     checkForExit();