chore: use vscode-markdown-languageservice for link linting (#36901)

* chore: use vscode-markdown-languageservice for docs link linting

* docs: make links relative
This commit is contained in:
David Sanders 2023-01-24 00:00:25 -08:00 committed by GitHub
parent 37f5881882
commit ca3145a547
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 700 additions and 176 deletions

View file

@ -1,130 +0,0 @@
#!/usr/bin/env python3
from __future__ import print_function
import os
import sys
import re
# Repository root (two directory levels above this script) and the
# docs directory whose Markdown files are linted.
SOURCE_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
DOCS_DIR = os.path.join(SOURCE_ROOT, 'docs')
def main():
    """Walk the docs tree, lint every Markdown file, and report totals.

    Returns the total number of broken relative links found, which doubles
    as the process exit code (0 means success).
    """
    os.chdir(SOURCE_ROOT)
    markdown_paths = []
    subdir_count = 0
    try:
        for root, dirs, files in os.walk(DOCS_DIR):
            subdir_count += len(dirs)
            markdown_paths.extend(
                os.path.join(root, name)
                for name in files
                if name.endswith('.md')
            )
    except KeyboardInterrupt:
        print('Keyboard interruption. Please try again.')
        return 0
    broken_total = 0
    for doc_path in markdown_paths:
        broken_total += getBrokenLinks(doc_path)
    print('Parsed through ' + str(len(markdown_paths)) +
          ' files within docs directory and its ' +
          str(subdir_count) + ' subdirectories.')
    print('Found ' + str(broken_total) + ' broken relative links.')
    return broken_total
def getBrokenLinks(filepath):
    """Return the number of broken relative links in one Markdown file.

    Collects inline ``[text](link)`` and reference-style ``[label]: link``
    targets that are not http(s) URLs, then verifies that each target file
    (and optional ``#section`` fragment) exists. Broken links are printed
    via print_errors().
    """
    currentDir = os.path.dirname(filepath)
    brokenLinks = []
    # `with` guarantees the handle is closed on any exception; the old
    # try/finally raised NameError on `f` if open() itself failed, and left
    # `lines` unbound after a KeyboardInterrupt.
    with open(filepath, 'r', encoding="utf-8") as f:
        lines = f.readlines()
    # Raw strings avoid invalid-escape-sequence warnings in the patterns.
    linkRegexLink = re.compile(r'\[(.*?)\]\((?P<link>(.*?))\)')
    referenceLinkRegex = re.compile(
        r'^\s{0,3}\[.*?\]:\s*(?P<link>[^<\s]+|<[^<>\r\n]+>)'
    )
    links = []
    for line in lines:
        matchLinks = linkRegexLink.search(line)
        matchReferenceLinks = referenceLinkRegex.search(line)
        if matchLinks:
            relativeLink = matchLinks.group('link')
            if not str(relativeLink).startswith('http'):
                links.append(relativeLink)
        if matchReferenceLinks:
            referenceLink = matchReferenceLinks.group('link').strip('<>')
            if not str(referenceLink).startswith('http'):
                links.append(referenceLink)
    for link in links:
        sections = link.split('#')
        if len(sections) < 2:
            # Plain file link with no fragment: the target must exist.
            if not os.path.isfile(os.path.join(currentDir, link)):
                brokenLinks.append(link)
        elif str(link).startswith('#'):
            # In-page anchor: check against this file's own headers.
            if not checkSections(sections, lines):
                brokenLinks.append(link)
        else:
            # Link to a section in another file: the file must exist and
            # contain a matching header.
            tempFile = os.path.join(currentDir, sections[0])
            if os.path.isfile(tempFile):
                with open(tempFile, 'r', encoding="utf-8") as newFile:
                    newLines = newFile.readlines()
                if not checkSections(sections, newLines):
                    brokenLinks.append(link)
            else:
                brokenLinks.append(link)
    print_errors(filepath, brokenLinks)
    return len(brokenLinks)
def checkSections(sections, lines):
    """Return True if the fragment in ``sections[1]`` matches a header slug.

    ``sections`` is ``link.split('#')``, so ``sections[1]`` is the fragment
    (already lowercase per GitHub slug convention). Every ``# Header`` found
    in ``lines`` is slugified and compared case-insensitively.
    """
    # Raw string avoids the invalid '\-' escape warning of a normal string.
    invalidCharsRegex = r'[^A-Za-z0-9_ \-]'
    sectionHeader = sections[1]
    regexSectionTitle = re.compile('# (?P<header>.*)')
    for line in lines:
        matchHeader = regexSectionTitle.search(line)
        if matchHeader:
            # This does the following to slugify a header name:
            #   * Replace whitespace with dashes
            #   * Strip anything that's not alphanumeric or a dash
            #   * Anything quoted with backticks (`) is an exception and will
            #     not have underscores stripped
            matchHeader = str(matchHeader.group('header')).replace(' ', '-')
            matchHeader = ''.join(
                map(
                    lambda match: re.sub(invalidCharsRegex, '', match[0])
                    + re.sub(invalidCharsRegex + '|_', '', match[1]),
                    re.findall('(`[^`]+`)|([^`]+)', matchHeader),
                )
            )
            if matchHeader.lower() == sectionHeader:
                return True
    return False
def print_errors(filepath, brokenLink):
    """Print the file location followed by each broken link it contains.

    Prints nothing when the broken-link list is empty.
    """
    if not brokenLink:
        return
    print("File Location: " + filepath)
    for link in brokenLink:
        print("\tBroken links: " + link)


if __name__ == '__main__':
    sys.exit(main())

334
script/lib/markdown.ts Normal file
View file

@ -0,0 +1,334 @@
import * as fs from 'fs';
import * as path from 'path';
import * as MarkdownIt from 'markdown-it';
import {
githubSlugifier,
resolveInternalDocumentLink,
ExternalHref,
FileStat,
HrefKind,
InternalHref,
IMdLinkComputer,
IMdParser,
ITextDocument,
IWorkspace,
MdLink,
MdLinkKind
} from '@dsanders11/vscode-markdown-languageservice';
import { Emitter, Range } from 'vscode-languageserver';
import { TextDocument } from 'vscode-languageserver-textdocument';
import { URI } from 'vscode-uri';
import { findMatchingFiles } from './utils';
import type { Definition, ImageReference, Link, LinkReference } from 'mdast';
import type { fromMarkdown as FromMarkdownFunction } from 'mdast-util-from-markdown';
import type { Node, Position } from 'unist';
import type { visit as VisitFunction } from 'unist-util-visit';
// Helper function to work around import issues with ESM modules and ts-node
// (wrapping `import()` in `new Function` keeps it from being transpiled to
// `require()`, so ESM-only packages can still be loaded at runtime).
// eslint-disable-next-line no-new-func
const dynamicImport = new Function('specifier', 'return import(specifier)');
// Helper function from `vscode-markdown-languageservice` codebase
function tryDecodeUri (str: string): string {
  // decodeURI throws a URIError on malformed escape sequences; fall back
  // to the raw input rather than failing the caller.
  let decoded: string;
  try {
    decoded = decodeURI(str);
  } catch {
    decoded = str;
  }
  return decoded;
}
// Helper function from `vscode-markdown-languageservice` codebase
function createHref (
  sourceDocUri: URI,
  link: string,
  workspace: IWorkspace
): ExternalHref | InternalHref | undefined {
  // A scheme prefix (e.g. `https:`, `mailto:`) marks the link as external.
  const looksLikeUri = /^[a-z-][a-z-]+:/i.test(link);
  if (looksLikeUri) {
    return { kind: HrefKind.External, uri: URI.parse(tryDecodeUri(link)) };
  }
  // Otherwise resolve it as a document-relative link within the workspace.
  const resolved = resolveInternalDocumentLink(sourceDocUri, link, workspace);
  return !resolved
    ? undefined
    : {
        kind: HrefKind.Internal,
        path: resolved.resource,
        fragment: resolved.linkFragment
      };
}
// Convert a 1-based mdast/unist Position into a 0-based LSP Range.
function positionToRange (position: Position): Range {
  const toPoint = (point: { line: number; column: number }) => ({
    character: point.column - 1,
    line: point.line - 1
  });
  return { start: toPoint(position.start), end: toPoint(position.end) };
}
// Shared markdown-it instance; `html: true` permits raw HTML in the docs.
const mdIt = MarkdownIt({ html: true });

// IMdParser implementation backed by markdown-it, with GitHub-style
// header slugs so fragments match rendered anchors.
export class MarkdownParser implements IMdParser {
  slugifier = githubSlugifier;

  // Tokenize the document text with markdown-it (no environment state).
  async tokenize (document: TextDocument) {
    return mdIt.parse(document.getText(), {});
  }
}
// IWorkspace implementation rooted at the docs directory. Documents are
// read eagerly from disk and cached; the watch-related events exist only
// to satisfy the interface and never fire.
export class DocsWorkspace implements IWorkspace {
  // Cache of opened documents, keyed by filesystem path.
  private readonly documentCache: Map<string, TextDocument>;
  readonly root: string;

  constructor (root: string) {
    this.documentCache = new Map();
    this.root = root;
  }

  get workspaceFolders () {
    return [URI.file(this.root)];
  }

  // Load (and cache) every `.md` file under the workspace root.
  async getAllMarkdownDocuments (): Promise<Iterable<ITextDocument>> {
    const files = await findMatchingFiles(this.root, (file) =>
      file.endsWith('.md')
    );

    for (const file of files) {
      const document = TextDocument.create(
        URI.file(file).toString(),
        'markdown',
        1,
        fs.readFileSync(file, 'utf8')
      );
      this.documentCache.set(file, document);
    }

    return this.documentCache.values();
  }

  // A document belongs to this workspace when its path is under `root`
  // and the file exists on disk.
  hasMarkdownDocument (resource: URI) {
    const relativePath = path.relative(this.root, resource.path);
    return (
      !relativePath.startsWith('..') &&
      !path.isAbsolute(relativePath) &&
      fs.existsSync(resource.path)
    );
  }

  // Return the cached document, reading it from disk on first access.
  async openMarkdownDocument (resource: URI) {
    if (!this.documentCache.has(resource.path)) {
      const document = TextDocument.create(
        resource.toString(),
        'markdown',
        1,
        fs.readFileSync(resource.path, 'utf8')
      );
      this.documentCache.set(resource.path, document);
    }
    return this.documentCache.get(resource.path);
  }

  // Stat only documents inside the workspace; others report undefined.
  async stat (resource: URI): Promise<FileStat | undefined> {
    if (this.hasMarkdownDocument(resource)) {
      const stats = fs.statSync(resource.path);
      return { isDirectory: stats.isDirectory() };
    }
    return undefined;
  }

  // Not needed for the link-linting use case.
  async readDirectory (): Promise<Iterable<readonly [string, FileStat]>> {
    throw new Error('Not implemented');
  }

  //
  // These events are defined to fulfill the interface, but are never emitted
  // by this implementation since it's not meant for watching a workspace
  //
  #onDidChangeMarkdownDocument = new Emitter<ITextDocument>();
  onDidChangeMarkdownDocument = this.#onDidChangeMarkdownDocument.event;

  #onDidCreateMarkdownDocument = new Emitter<ITextDocument>();
  onDidCreateMarkdownDocument = this.#onDidCreateMarkdownDocument.event;

  #onDidDeleteMarkdownDocument = new Emitter<URI>();
  onDidDeleteMarkdownDocument = this.#onDidDeleteMarkdownDocument.event;
}
// IMdLinkComputer implementation that parses documents with mdast and
// reports inline links, reference links/images, and link definitions.
export class MarkdownLinkComputer implements IMdLinkComputer {
  private readonly workspace: IWorkspace;

  constructor (workspace: IWorkspace) {
    this.workspace = workspace;
  }

  // Parse the document into an mdast tree and merge the three link flavors.
  async getAllLinks (document: ITextDocument): Promise<MdLink[]> {
    // Dynamically imported because mdast-util-from-markdown is ESM-only.
    const { fromMarkdown } = (await dynamicImport(
      'mdast-util-from-markdown'
    )) as { fromMarkdown: typeof FromMarkdownFunction };

    const tree = fromMarkdown(document.getText());

    const links = [
      ...(await this.#getInlineLinks(document, tree)),
      ...(await this.#getReferenceLinks(document, tree)),
      ...(await this.#getLinkDefinitions(document, tree))
    ];

    return links;
  }

  // Collect inline links like `[text](target)`.
  async #getInlineLinks (
    document: ITextDocument,
    tree: Node
  ): Promise<MdLink[]> {
    const { visit } = (await dynamicImport('unist-util-visit')) as {
      visit: typeof VisitFunction;
    };

    const documentUri = URI.parse(document.uri);
    const links: MdLink[] = [];

    visit(
      tree,
      (node) => node.type === 'link',
      (node: Node) => {
        const link = node as Link;
        const href = createHref(documentUri, link.url, this.workspace);

        if (href) {
          const range = positionToRange(link.position!);

          // NOTE - These haven't been implemented properly, but their
          // values aren't used for the link linting use-case
          const targetRange = range;
          const hrefRange = range;
          const fragmentRange = undefined;

          links.push({
            kind: MdLinkKind.Link,
            href,
            source: {
              hrefText: link.url,
              resource: documentUri,
              range,
              targetRange,
              hrefRange,
              fragmentRange,
              pathText: link.url.split('#')[0]
            }
          });
        }
      }
    );

    return links;
  }

  // Collect reference-style links and images like `[text][label]`.
  async #getReferenceLinks (
    document: ITextDocument,
    tree: Node
  ): Promise<MdLink[]> {
    const { visit } = (await dynamicImport('unist-util-visit')) as {
      visit: typeof VisitFunction;
    };

    const links: MdLink[] = [];

    visit(
      tree,
      (node) => ['imageReference', 'linkReference'].includes(node.type),
      (node: Node) => {
        const link = node as ImageReference | LinkReference;
        const range = positionToRange(link.position!);

        // NOTE - These haven't been implemented properly, but their
        // values aren't used for the link linting use-case
        const targetRange = range;
        const hrefRange = range;

        links.push({
          kind: MdLinkKind.Link,
          href: {
            kind: HrefKind.Reference,
            ref: link.label!
          },
          source: {
            hrefText: link.label!,
            resource: URI.parse(document.uri),
            range,
            targetRange,
            hrefRange,
            fragmentRange: undefined,
            pathText: link.label!
          }
        });
      }
    );

    return links;
  }

  // Collect link definitions like `[label]: target`.
  async #getLinkDefinitions (
    document: ITextDocument,
    tree: Node
  ): Promise<MdLink[]> {
    const { visit } = (await dynamicImport('unist-util-visit')) as {
      visit: typeof VisitFunction;
    };

    const documentUri = URI.parse(document.uri);
    const links: MdLink[] = [];

    visit(
      tree,
      (node) => node.type === 'definition',
      (node: Node) => {
        const definition = node as Definition;
        const href = createHref(documentUri, definition.url, this.workspace);

        if (href) {
          const range = positionToRange(definition.position!);

          // NOTE - These haven't been implemented properly, but their
          // values aren't used for the link linting use-case
          const targetRange = range;
          const hrefRange = range;
          const fragmentRange = undefined;

          links.push({
            kind: MdLinkKind.Definition,
            href,
            ref: {
              range,
              text: definition.label!
            },
            source: {
              hrefText: definition.url,
              resource: documentUri,
              range,
              targetRange,
              hrefRange,
              fragmentRange,
              pathText: definition.url.split('#')[0]
            }
          });
        }
      }
    );

    return links;
  }
}

View file

@ -1,5 +1,6 @@
const { GitProcess } = require('dugite');
const fs = require('fs');
const klaw = require('klaw');
const os = require('os');
const path = require('path');
@ -122,8 +123,29 @@ function chunkFilenames (filenames, offset = 0) {
);
}
/**
 * Recursively walk `top` and collect paths accepted by `test`.
 *
 * @param {string} top
 * @param {(filename: string) => boolean} test
 * @returns {Promise<string[]>}
 */
async function findMatchingFiles (top, test) {
  return new Promise((resolve, reject) => {
    const matches = [];
    klaw(top, {
      filter: f => path.basename(f) !== '.bin'
    })
      // Propagate walker errors; previously `reject` was never wired up,
      // so a filesystem error left the promise pending forever.
      .on('error', err => reject(err))
      .on('end', () => resolve(matches))
      .on('data', item => {
        if (test(item.path)) {
          matches.push(item.path);
        }
      });
  });
}
module.exports = {
chunkFilenames,
findMatchingFiles,
getCurrentBranch,
getElectronExec,
getOutDir,

177
script/lint-docs-links.ts Executable file
View file

@ -0,0 +1,177 @@
#!/usr/bin/env ts-node
import * as path from 'path';
import {
createLanguageService,
DiagnosticLevel,
DiagnosticOptions,
ILogger
} from '@dsanders11/vscode-markdown-languageservice';
import * as minimist from 'minimist';
import fetch from 'node-fetch';
import { CancellationTokenSource } from 'vscode-languageserver';
import { URI } from 'vscode-uri';
import {
DocsWorkspace,
MarkdownLinkComputer,
MarkdownParser
} from './lib/markdown';
// Logger that discards every message; the language service requires an
// ILogger but we don't want its output in the lint results.
class NoOpLogger implements ILogger {
  log (): void {
    // Intentionally empty.
  }
}
// Lint configuration: every diagnostic category the language service can
// produce is escalated to an error, and no links are exempted.
const diagnosticOptions: DiagnosticOptions = {
  ignoreLinks: [],
  validateDuplicateLinkDefinitions: DiagnosticLevel.error,
  validateFileLinks: DiagnosticLevel.error,
  validateFragmentLinks: DiagnosticLevel.error,
  validateMarkdownFileLinkFragments: DiagnosticLevel.error,
  validateReferences: DiagnosticLevel.error,
  validateUnusedLinkDefinitions: DiagnosticLevel.error
};
/**
 * Fetch an external URL to confirm it resolves.
 *
 * Logs broken links and (optionally) meaningful redirects to the console.
 *
 * @param link - Absolute http(s) URL to check.
 * @param checkRedirects - Also report redirects that aren't just the
 *   `www.`-prefixed or trailing-slash variant of the original URL.
 * @returns true if the link resolved successfully, false otherwise.
 */
async function fetchExternalLink (link: string, checkRedirects = false) {
  try {
    const response = await fetch(link);
    // Accept any 2xx status; the previous `status !== 200` check wrongly
    // flagged valid responses such as 204 No Content as broken.
    if (!response.ok) {
      console.log('Broken link', link, response.status, response.statusText);
    } else {
      if (checkRedirects && response.redirected) {
        const wwwUrl = new URL(link);
        wwwUrl.hostname = `www.${wwwUrl.hostname}`;

        // For now cut down on noise to find meaningful redirects
        const wwwRedirect = wwwUrl.toString() === response.url;
        const trailingSlashRedirect = `${link}/` === response.url;

        if (!wwwRedirect && !trailingSlashRedirect) {
          console.log('Link redirection', link, '->', response.url);
        }
      }

      return true;
    }
  } catch {
    console.log('Broken link', link);
  }

  return false;
}
/**
 * Lint every Markdown document in the docs workspace for broken links.
 *
 * @returns true if any broken links (or failed external fetches) were found.
 */
async function main ({ fetchExternalLinks = false, checkRedirects = false }) {
  const workspace = new DocsWorkspace(path.resolve(__dirname, '..', 'docs'));
  const parser = new MarkdownParser();
  const linkComputer = new MarkdownLinkComputer(workspace);
  const languageService = createLanguageService({
    workspace,
    parser,
    logger: new NoOpLogger(),
    linkComputer
  });

  const cts = new CancellationTokenSource();
  let errors = false;

  // External links are deduplicated here and checked separately below.
  const externalLinks = new Set<string>();

  try {
    // Collect diagnostics for all documents in the workspace
    for (const document of await workspace.getAllMarkdownDocuments()) {
      for (let link of await languageService.getDocumentLinks(
        document,
        cts.token
      )) {
        // Links without a target need a second resolution pass.
        if (link.target === undefined) {
          link =
            (await languageService.resolveDocumentLink(link, cts.token)) ??
            link;
        }

        // Localhost links are skipped since they aren't reachable here.
        if (
          link.target &&
          link.target.startsWith('http') &&
          new URL(link.target).hostname !== 'localhost'
        ) {
          externalLinks.add(link.target);
        }
      }

      const diagnostics = await languageService.computeDiagnostics(
        document,
        diagnosticOptions,
        cts.token
      );

      if (diagnostics.length) {
        console.log(
          'File Location:',
          path.relative(workspace.root, URI.parse(document.uri).path)
        );
      }

      // Diagnostic lines are 0-based; report them 1-based for humans.
      for (const diagnostic of diagnostics) {
        console.log(
          `\tBroken link on line ${diagnostic.range.start.line + 1}:`,
          diagnostic.message
        );
        errors = true;
      }
    }
  } finally {
    cts.dispose();
  }

  if (fetchExternalLinks) {
    // Fetch all external links concurrently; any failure is an error.
    const externalLinkStates = await Promise.all(
      Array.from(externalLinks).map((link) =>
        fetchExternalLink(link, checkRedirects)
      )
    );
    errors = errors || !externalLinkStates.every((x) => x);
  }

  return errors;
}
// Parse CLI flags; prints usage and exits for --help or unknown options.
function parseCommandLine () {
  const showUsage = (arg?: string): boolean => {
    const isHelpOrFlag = !arg || arg.startsWith('-');
    if (isHelpOrFlag) {
      console.log(
        'Usage: script/lint-docs-links.ts [-h|--help] [--fetch-external-links] ' +
          '[--check-redirects]'
      );
      process.exit(0);
    }
    return true;
  };

  const opts = minimist(process.argv.slice(2), {
    boolean: ['help', 'fetch-external-links', 'check-redirects'],
    stopEarly: true,
    unknown: showUsage
  });

  if (opts.help) showUsage();

  return opts;
}
// Run only when executed directly (not when imported as a module).
// `require.main === module` replaces the deprecated `process.mainModule`.
if (require.main === module) {
  const opts = parseCommandLine();

  main({
    fetchExternalLinks: opts['fetch-external-links'],
    checkRedirects: opts['check-redirects']
  })
    .then((errors) => {
      // Broken links fail the lint with a non-zero exit code.
      if (errors) process.exit(1);
    })
    .catch((error) => {
      console.error(error);
      process.exit(1);
    });
}

View file

@ -5,11 +5,10 @@ const { GitProcess } = require('dugite');
const childProcess = require('child_process');
const { ESLint } = require('eslint');
const fs = require('fs');
const klaw = require('klaw');
const minimist = require('minimist');
const path = require('path');
const { chunkFilenames } = require('./lib/utils');
const { chunkFilenames, findMatchingFiles } = require('./lib/utils');
const ELECTRON_ROOT = path.normalize(path.dirname(__dirname));
const SOURCE_ROOT = path.resolve(ELECTRON_ROOT, '..');
@ -279,21 +278,6 @@ async function findChangedFiles (top) {
return new Set(absolutePaths);
}
// NOTE(review): local copy of findMatchingFiles — duplicates the helper in
// script/lib/utils.js; the shared version should be preferred. Walks `top`
// with klaw and resolves with the paths accepted by `test`.
async function findMatchingFiles (top, test) {
  return new Promise((resolve, reject) => {
    const matches = [];
    klaw(top, {
      filter: f => path.basename(f) !== '.bin'
    })
      .on('end', () => resolve(matches))
      .on('data', item => {
        if (test(item.path)) {
          matches.push(item.path);
        }
      });
  });
}
async function findFiles (args, linter) {
let filenames = [];
let includelist = null;

View file

@ -1,6 +1,5 @@
import * as childProcess from 'child_process';
import * as fs from 'fs';
import * as klaw from 'klaw';
import * as minimist from 'minimist';
import * as os from 'os';
import * as path from 'path';
@ -9,7 +8,7 @@ import * as streamJson from 'stream-json';
import { ignore as streamJsonIgnore } from 'stream-json/filters/Ignore';
import { streamArray as streamJsonStreamArray } from 'stream-json/streamers/StreamArray';
import { chunkFilenames } from './lib/utils';
import { chunkFilenames, findMatchingFiles } from './lib/utils';
const SOURCE_ROOT = path.normalize(path.dirname(__dirname));
const LLVM_BIN = path.resolve(
@ -204,24 +203,6 @@ async function runClangTidy (
}
}
// NOTE(review): local copy of findMatchingFiles — duplicates the helper in
// script/lib/utils; the shared version should be preferred. Walks `top`
// with klaw and resolves with the paths accepted by `test`.
async function findMatchingFiles (
  top: string,
  test: (filename: string) => boolean
): Promise<string[]> {
  return new Promise((resolve) => {
    const matches = [] as string[];
    klaw(top, {
      filter: (f) => path.basename(f) !== '.bin'
    })
      .on('end', () => resolve(matches))
      .on('data', (item) => {
        if (test(item.path)) {
          matches.push(item.path);
        }
      });
  });
}
function parseCommandLine () {
const showUsage = (arg?: string) : boolean => {
if (!arg || arg.startsWith('-')) {