chore: use vscode-markdown-languageservice for link linting (#36901)

* chore: use vscode-markdown-languageservice for docs link linting

* docs: make links relative
This commit is contained in:
David Sanders 2023-01-24 00:00:25 -08:00 committed by GitHub
parent 37f5881882
commit ca3145a547
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 700 additions and 176 deletions

View file

@ -1,130 +0,0 @@
#!/usr/bin/env python3
from __future__ import print_function
import os
import sys
import re
# Repository root (two directory levels above this script) and the
# docs directory whose Markdown files are linted.
SOURCE_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
DOCS_DIR = os.path.join(SOURCE_ROOT, 'docs')
def main():
    """Walk the docs tree, lint every Markdown file, and report totals.

    Returns the total number of broken relative links found, which doubles
    as the process exit code (0 means success).
    """
    os.chdir(SOURCE_ROOT)
    markdown_paths = []
    subdir_count = 0
    try:
        for root, dirs, files in os.walk(DOCS_DIR):
            subdir_count += len(dirs)
            markdown_paths.extend(
                os.path.join(root, name)
                for name in files
                if name.endswith('.md')
            )
    except KeyboardInterrupt:
        print('Keyboard interruption. Please try again.')
        return 0
    broken_total = 0
    for doc_path in markdown_paths:
        broken_total += getBrokenLinks(doc_path)
    print('Parsed through ' + str(len(markdown_paths)) +
          ' files within docs directory and its ' +
          str(subdir_count) + ' subdirectories.')
    print('Found ' + str(broken_total) + ' broken relative links.')
    return broken_total
def getBrokenLinks(filepath):
    """Return the number of broken relative links in one Markdown file.

    Collects inline ``[text](link)`` and reference-style ``[label]: link``
    targets that are not http(s) URLs, then verifies that each target file
    (and optional ``#section`` fragment) exists. Broken links are printed
    via print_errors().
    """
    currentDir = os.path.dirname(filepath)
    brokenLinks = []
    # `with` guarantees the handle is closed on any exception; the old
    # try/finally raised NameError on `f` if open() itself failed, and left
    # `lines` unbound after a KeyboardInterrupt.
    with open(filepath, 'r', encoding="utf-8") as f:
        lines = f.readlines()
    # Raw strings avoid invalid-escape-sequence warnings in the patterns.
    linkRegexLink = re.compile(r'\[(.*?)\]\((?P<link>(.*?))\)')
    referenceLinkRegex = re.compile(
        r'^\s{0,3}\[.*?\]:\s*(?P<link>[^<\s]+|<[^<>\r\n]+>)'
    )
    links = []
    for line in lines:
        matchLinks = linkRegexLink.search(line)
        matchReferenceLinks = referenceLinkRegex.search(line)
        if matchLinks:
            relativeLink = matchLinks.group('link')
            if not str(relativeLink).startswith('http'):
                links.append(relativeLink)
        if matchReferenceLinks:
            referenceLink = matchReferenceLinks.group('link').strip('<>')
            if not str(referenceLink).startswith('http'):
                links.append(referenceLink)
    for link in links:
        sections = link.split('#')
        if len(sections) < 2:
            # Plain file link with no fragment: the target must exist.
            if not os.path.isfile(os.path.join(currentDir, link)):
                brokenLinks.append(link)
        elif str(link).startswith('#'):
            # In-page anchor: check against this file's own headers.
            if not checkSections(sections, lines):
                brokenLinks.append(link)
        else:
            # Link to a section in another file: the file must exist and
            # contain a matching header.
            tempFile = os.path.join(currentDir, sections[0])
            if os.path.isfile(tempFile):
                with open(tempFile, 'r', encoding="utf-8") as newFile:
                    newLines = newFile.readlines()
                if not checkSections(sections, newLines):
                    brokenLinks.append(link)
            else:
                brokenLinks.append(link)
    print_errors(filepath, brokenLinks)
    return len(brokenLinks)
def checkSections(sections, lines):
    """Return True if the fragment in ``sections[1]`` matches a header slug.

    ``sections`` is ``link.split('#')``, so ``sections[1]`` is the fragment
    (already lowercase per GitHub slug convention). Every ``# Header`` found
    in ``lines`` is slugified and compared case-insensitively.
    """
    # Raw string avoids the invalid '\-' escape warning of a normal string.
    invalidCharsRegex = r'[^A-Za-z0-9_ \-]'
    sectionHeader = sections[1]
    regexSectionTitle = re.compile('# (?P<header>.*)')
    for line in lines:
        matchHeader = regexSectionTitle.search(line)
        if matchHeader:
            # This does the following to slugify a header name:
            #   * Replace whitespace with dashes
            #   * Strip anything that's not alphanumeric or a dash
            #   * Anything quoted with backticks (`) is an exception and will
            #     not have underscores stripped
            matchHeader = str(matchHeader.group('header')).replace(' ', '-')
            matchHeader = ''.join(
                map(
                    lambda match: re.sub(invalidCharsRegex, '', match[0])
                    + re.sub(invalidCharsRegex + '|_', '', match[1]),
                    re.findall('(`[^`]+`)|([^`]+)', matchHeader),
                )
            )
            if matchHeader.lower() == sectionHeader:
                return True
    return False
def print_errors(filepath, brokenLink):
    """Print the file location followed by each broken link it contains.

    Prints nothing when the broken-link list is empty.
    """
    if not brokenLink:
        return
    print("File Location: " + filepath)
    for link in brokenLink:
        print("\tBroken links: " + link)


if __name__ == '__main__':
    sys.exit(main())

334
script/lib/markdown.ts Normal file
View file

@ -0,0 +1,334 @@
import * as fs from 'fs';
import * as path from 'path';
import * as MarkdownIt from 'markdown-it';
import {
githubSlugifier,
resolveInternalDocumentLink,
ExternalHref,
FileStat,
HrefKind,
InternalHref,
IMdLinkComputer,
IMdParser,
ITextDocument,
IWorkspace,
MdLink,
MdLinkKind
} from '@dsanders11/vscode-markdown-languageservice';
import { Emitter, Range } from 'vscode-languageserver';
import { TextDocument } from 'vscode-languageserver-textdocument';
import { URI } from 'vscode-uri';
import { findMatchingFiles } from './utils';
import type { Definition, ImageReference, Link, LinkReference } from 'mdast';
import type { fromMarkdown as FromMarkdownFunction } from 'mdast-util-from-markdown';
import type { Node, Position } from 'unist';
import type { visit as VisitFunction } from 'unist-util-visit';
// Helper function to work around import issues with ESM modules and ts-node
// (wrapping `import()` in `new Function` keeps it from being transpiled to
// `require()`, so ESM-only packages can still be loaded at runtime).
// eslint-disable-next-line no-new-func
const dynamicImport = new Function('specifier', 'return import(specifier)');
// Helper function from `vscode-markdown-languageservice` codebase
function tryDecodeUri (str: string): string {
  // decodeURI throws a URIError on malformed escape sequences; fall back
  // to the raw input rather than failing the caller.
  let decoded: string;
  try {
    decoded = decodeURI(str);
  } catch {
    decoded = str;
  }
  return decoded;
}
// Helper function from `vscode-markdown-languageservice` codebase
function createHref (
  sourceDocUri: URI,
  link: string,
  workspace: IWorkspace
): ExternalHref | InternalHref | undefined {
  // A scheme prefix (e.g. `https:`, `mailto:`) marks the link as external.
  const looksLikeUri = /^[a-z-][a-z-]+:/i.test(link);
  if (looksLikeUri) {
    return { kind: HrefKind.External, uri: URI.parse(tryDecodeUri(link)) };
  }
  // Otherwise resolve it as a document-relative link within the workspace.
  const resolved = resolveInternalDocumentLink(sourceDocUri, link, workspace);
  return !resolved
    ? undefined
    : {
        kind: HrefKind.Internal,
        path: resolved.resource,
        fragment: resolved.linkFragment
      };
}
// Convert a 1-based mdast/unist Position into a 0-based LSP Range.
function positionToRange (position: Position): Range {
  const toPoint = (point: { line: number; column: number }) => ({
    character: point.column - 1,
    line: point.line - 1
  });
  return { start: toPoint(position.start), end: toPoint(position.end) };
}
// Shared markdown-it instance; `html: true` permits raw HTML in the docs.
const mdIt = MarkdownIt({ html: true });

// IMdParser implementation backed by markdown-it, with GitHub-style
// header slugs so fragments match rendered anchors.
export class MarkdownParser implements IMdParser {
  slugifier = githubSlugifier;

  // Tokenize the document text with markdown-it (no environment state).
  async tokenize (document: TextDocument) {
    return mdIt.parse(document.getText(), {});
  }
}
// IWorkspace implementation rooted at the docs directory. Documents are
// read eagerly from disk and cached; the watch-related events exist only
// to satisfy the interface and never fire.
export class DocsWorkspace implements IWorkspace {
  // Cache of opened documents, keyed by filesystem path.
  private readonly documentCache: Map<string, TextDocument>;
  readonly root: string;

  constructor (root: string) {
    this.documentCache = new Map();
    this.root = root;
  }

  get workspaceFolders () {
    return [URI.file(this.root)];
  }

  // Load (and cache) every `.md` file under the workspace root.
  async getAllMarkdownDocuments (): Promise<Iterable<ITextDocument>> {
    const files = await findMatchingFiles(this.root, (file) =>
      file.endsWith('.md')
    );

    for (const file of files) {
      const document = TextDocument.create(
        URI.file(file).toString(),
        'markdown',
        1,
        fs.readFileSync(file, 'utf8')
      );
      this.documentCache.set(file, document);
    }

    return this.documentCache.values();
  }

  // A document belongs to this workspace when its path is under `root`
  // and the file exists on disk.
  hasMarkdownDocument (resource: URI) {
    const relativePath = path.relative(this.root, resource.path);
    return (
      !relativePath.startsWith('..') &&
      !path.isAbsolute(relativePath) &&
      fs.existsSync(resource.path)
    );
  }

  // Return the cached document, reading it from disk on first access.
  async openMarkdownDocument (resource: URI) {
    if (!this.documentCache.has(resource.path)) {
      const document = TextDocument.create(
        resource.toString(),
        'markdown',
        1,
        fs.readFileSync(resource.path, 'utf8')
      );
      this.documentCache.set(resource.path, document);
    }
    return this.documentCache.get(resource.path);
  }

  // Stat only documents inside the workspace; others report undefined.
  async stat (resource: URI): Promise<FileStat | undefined> {
    if (this.hasMarkdownDocument(resource)) {
      const stats = fs.statSync(resource.path);
      return { isDirectory: stats.isDirectory() };
    }
    return undefined;
  }

  // Not needed for the link-linting use case.
  async readDirectory (): Promise<Iterable<readonly [string, FileStat]>> {
    throw new Error('Not implemented');
  }

  //
  // These events are defined to fulfill the interface, but are never emitted
  // by this implementation since it's not meant for watching a workspace
  //
  #onDidChangeMarkdownDocument = new Emitter<ITextDocument>();
  onDidChangeMarkdownDocument = this.#onDidChangeMarkdownDocument.event;

  #onDidCreateMarkdownDocument = new Emitter<ITextDocument>();
  onDidCreateMarkdownDocument = this.#onDidCreateMarkdownDocument.event;

  #onDidDeleteMarkdownDocument = new Emitter<URI>();
  onDidDeleteMarkdownDocument = this.#onDidDeleteMarkdownDocument.event;
}
// IMdLinkComputer implementation that parses documents with mdast and
// reports inline links, reference links/images, and link definitions.
export class MarkdownLinkComputer implements IMdLinkComputer {
  private readonly workspace: IWorkspace;

  constructor (workspace: IWorkspace) {
    this.workspace = workspace;
  }

  // Parse the document into an mdast tree and merge the three link flavors.
  async getAllLinks (document: ITextDocument): Promise<MdLink[]> {
    // Dynamically imported because mdast-util-from-markdown is ESM-only.
    const { fromMarkdown } = (await dynamicImport(
      'mdast-util-from-markdown'
    )) as { fromMarkdown: typeof FromMarkdownFunction };

    const tree = fromMarkdown(document.getText());

    const links = [
      ...(await this.#getInlineLinks(document, tree)),
      ...(await this.#getReferenceLinks(document, tree)),
      ...(await this.#getLinkDefinitions(document, tree))
    ];

    return links;
  }

  // Collect inline links like `[text](target)`.
  async #getInlineLinks (
    document: ITextDocument,
    tree: Node
  ): Promise<MdLink[]> {
    const { visit } = (await dynamicImport('unist-util-visit')) as {
      visit: typeof VisitFunction;
    };

    const documentUri = URI.parse(document.uri);
    const links: MdLink[] = [];

    visit(
      tree,
      (node) => node.type === 'link',
      (node: Node) => {
        const link = node as Link;
        const href = createHref(documentUri, link.url, this.workspace);

        if (href) {
          const range = positionToRange(link.position!);

          // NOTE - These haven't been implemented properly, but their
          // values aren't used for the link linting use-case
          const targetRange = range;
          const hrefRange = range;
          const fragmentRange = undefined;

          links.push({
            kind: MdLinkKind.Link,
            href,
            source: {
              hrefText: link.url,
              resource: documentUri,
              range,
              targetRange,
              hrefRange,
              fragmentRange,
              pathText: link.url.split('#')[0]
            }
          });
        }
      }
    );

    return links;
  }

  // Collect reference-style links and images like `[text][label]`.
  async #getReferenceLinks (
    document: ITextDocument,
    tree: Node
  ): Promise<MdLink[]> {
    const { visit } = (await dynamicImport('unist-util-visit')) as {
      visit: typeof VisitFunction;
    };

    const links: MdLink[] = [];

    visit(
      tree,
      (node) => ['imageReference', 'linkReference'].includes(node.type),
      (node: Node) => {
        const link = node as ImageReference | LinkReference;
        const range = positionToRange(link.position!);

        // NOTE - These haven't been implemented properly, but their
        // values aren't used for the link linting use-case
        const targetRange = range;
        const hrefRange = range;

        links.push({
          kind: MdLinkKind.Link,
          href: {
            kind: HrefKind.Reference,
            ref: link.label!
          },
          source: {
            hrefText: link.label!,
            resource: URI.parse(document.uri),
            range,
            targetRange,
            hrefRange,
            fragmentRange: undefined,
            pathText: link.label!
          }
        });
      }
    );

    return links;
  }

  // Collect link definitions like `[label]: target`.
  async #getLinkDefinitions (
    document: ITextDocument,
    tree: Node
  ): Promise<MdLink[]> {
    const { visit } = (await dynamicImport('unist-util-visit')) as {
      visit: typeof VisitFunction;
    };

    const documentUri = URI.parse(document.uri);
    const links: MdLink[] = [];

    visit(
      tree,
      (node) => node.type === 'definition',
      (node: Node) => {
        const definition = node as Definition;
        const href = createHref(documentUri, definition.url, this.workspace);

        if (href) {
          const range = positionToRange(definition.position!);

          // NOTE - These haven't been implemented properly, but their
          // values aren't used for the link linting use-case
          const targetRange = range;
          const hrefRange = range;
          const fragmentRange = undefined;

          links.push({
            kind: MdLinkKind.Definition,
            href,
            ref: {
              range,
              text: definition.label!
            },
            source: {
              hrefText: definition.url,
              resource: documentUri,
              range,
              targetRange,
              hrefRange,
              fragmentRange,
              pathText: definition.url.split('#')[0]
            }
          });
        }
      }
    );

    return links;
  }
}

View file

@ -1,5 +1,6 @@
const { GitProcess } = require('dugite');
const fs = require('fs');
const klaw = require('klaw');
const os = require('os');
const path = require('path');
@ -122,8 +123,29 @@ function chunkFilenames (filenames, offset = 0) {
);
}
/**
 * Recursively walk `top` and collect paths accepted by `test`.
 *
 * @param {string} top
 * @param {(filename: string) => boolean} test
 * @returns {Promise<string[]>}
 */
async function findMatchingFiles (top, test) {
  return new Promise((resolve, reject) => {
    const matches = [];
    klaw(top, {
      filter: f => path.basename(f) !== '.bin'
    })
      // Propagate walker errors; previously `reject` was never wired up,
      // so a filesystem error left the promise pending forever.
      .on('error', err => reject(err))
      .on('end', () => resolve(matches))
      .on('data', item => {
        if (test(item.path)) {
          matches.push(item.path);
        }
      });
  });
}
module.exports = {
chunkFilenames,
findMatchingFiles,
getCurrentBranch,
getElectronExec,
getOutDir,

177
script/lint-docs-links.ts Executable file
View file

@ -0,0 +1,177 @@
#!/usr/bin/env ts-node
import * as path from 'path';
import {
createLanguageService,
DiagnosticLevel,
DiagnosticOptions,
ILogger
} from '@dsanders11/vscode-markdown-languageservice';
import * as minimist from 'minimist';
import fetch from 'node-fetch';
import { CancellationTokenSource } from 'vscode-languageserver';
import { URI } from 'vscode-uri';
import {
DocsWorkspace,
MarkdownLinkComputer,
MarkdownParser
} from './lib/markdown';
// Logger that discards every message; the language service requires an
// ILogger but we don't want its output in the lint results.
class NoOpLogger implements ILogger {
  log (): void {
    // Intentionally empty.
  }
}
// Lint configuration: every diagnostic category the language service can
// produce is escalated to an error, and no links are exempted.
const diagnosticOptions: DiagnosticOptions = {
  ignoreLinks: [],
  validateDuplicateLinkDefinitions: DiagnosticLevel.error,
  validateFileLinks: DiagnosticLevel.error,
  validateFragmentLinks: DiagnosticLevel.error,
  validateMarkdownFileLinkFragments: DiagnosticLevel.error,
  validateReferences: DiagnosticLevel.error,
  validateUnusedLinkDefinitions: DiagnosticLevel.error
};
/**
 * Fetch an external URL to confirm it resolves.
 *
 * Logs broken links and (optionally) meaningful redirects to the console.
 *
 * @param link - Absolute http(s) URL to check.
 * @param checkRedirects - Also report redirects that aren't just the
 *   `www.`-prefixed or trailing-slash variant of the original URL.
 * @returns true if the link resolved successfully, false otherwise.
 */
async function fetchExternalLink (link: string, checkRedirects = false) {
  try {
    const response = await fetch(link);
    // Accept any 2xx status; the previous `status !== 200` check wrongly
    // flagged valid responses such as 204 No Content as broken.
    if (!response.ok) {
      console.log('Broken link', link, response.status, response.statusText);
    } else {
      if (checkRedirects && response.redirected) {
        const wwwUrl = new URL(link);
        wwwUrl.hostname = `www.${wwwUrl.hostname}`;

        // For now cut down on noise to find meaningful redirects
        const wwwRedirect = wwwUrl.toString() === response.url;
        const trailingSlashRedirect = `${link}/` === response.url;

        if (!wwwRedirect && !trailingSlashRedirect) {
          console.log('Link redirection', link, '->', response.url);
        }
      }

      return true;
    }
  } catch {
    console.log('Broken link', link);
  }

  return false;
}
/**
 * Lint every Markdown document in the docs workspace for broken links.
 *
 * @returns true if any broken links (or failed external fetches) were found.
 */
async function main ({ fetchExternalLinks = false, checkRedirects = false }) {
  const workspace = new DocsWorkspace(path.resolve(__dirname, '..', 'docs'));
  const parser = new MarkdownParser();
  const linkComputer = new MarkdownLinkComputer(workspace);
  const languageService = createLanguageService({
    workspace,
    parser,
    logger: new NoOpLogger(),
    linkComputer
  });

  const cts = new CancellationTokenSource();
  let errors = false;

  // External links are deduplicated here and checked separately below.
  const externalLinks = new Set<string>();

  try {
    // Collect diagnostics for all documents in the workspace
    for (const document of await workspace.getAllMarkdownDocuments()) {
      for (let link of await languageService.getDocumentLinks(
        document,
        cts.token
      )) {
        // Links without a target need a second resolution pass.
        if (link.target === undefined) {
          link =
            (await languageService.resolveDocumentLink(link, cts.token)) ??
            link;
        }

        // Localhost links are skipped since they aren't reachable here.
        if (
          link.target &&
          link.target.startsWith('http') &&
          new URL(link.target).hostname !== 'localhost'
        ) {
          externalLinks.add(link.target);
        }
      }

      const diagnostics = await languageService.computeDiagnostics(
        document,
        diagnosticOptions,
        cts.token
      );

      if (diagnostics.length) {
        console.log(
          'File Location:',
          path.relative(workspace.root, URI.parse(document.uri).path)
        );
      }

      // Diagnostic lines are 0-based; report them 1-based for humans.
      for (const diagnostic of diagnostics) {
        console.log(
          `\tBroken link on line ${diagnostic.range.start.line + 1}:`,
          diagnostic.message
        );
        errors = true;
      }
    }
  } finally {
    cts.dispose();
  }

  if (fetchExternalLinks) {
    // Fetch all external links concurrently; any failure is an error.
    const externalLinkStates = await Promise.all(
      Array.from(externalLinks).map((link) =>
        fetchExternalLink(link, checkRedirects)
      )
    );
    errors = errors || !externalLinkStates.every((x) => x);
  }

  return errors;
}
// Parse CLI flags; prints usage and exits for --help or unknown options.
function parseCommandLine () {
  const showUsage = (arg?: string): boolean => {
    const isHelpOrFlag = !arg || arg.startsWith('-');
    if (isHelpOrFlag) {
      console.log(
        'Usage: script/lint-docs-links.ts [-h|--help] [--fetch-external-links] ' +
          '[--check-redirects]'
      );
      process.exit(0);
    }
    return true;
  };

  const opts = minimist(process.argv.slice(2), {
    boolean: ['help', 'fetch-external-links', 'check-redirects'],
    stopEarly: true,
    unknown: showUsage
  });

  if (opts.help) showUsage();

  return opts;
}
// Run only when executed directly (not when imported as a module).
// `require.main === module` replaces the deprecated `process.mainModule`.
if (require.main === module) {
  const opts = parseCommandLine();

  main({
    fetchExternalLinks: opts['fetch-external-links'],
    checkRedirects: opts['check-redirects']
  })
    .then((errors) => {
      // Broken links fail the lint with a non-zero exit code.
      if (errors) process.exit(1);
    })
    .catch((error) => {
      console.error(error);
      process.exit(1);
    });
}

View file

@ -5,11 +5,10 @@ const { GitProcess } = require('dugite');
const childProcess = require('child_process');
const { ESLint } = require('eslint');
const fs = require('fs');
const klaw = require('klaw');
const minimist = require('minimist');
const path = require('path');
const { chunkFilenames } = require('./lib/utils');
const { chunkFilenames, findMatchingFiles } = require('./lib/utils');
const ELECTRON_ROOT = path.normalize(path.dirname(__dirname));
const SOURCE_ROOT = path.resolve(ELECTRON_ROOT, '..');
@ -279,21 +278,6 @@ async function findChangedFiles (top) {
return new Set(absolutePaths);
}
// NOTE(review): local copy of findMatchingFiles — duplicates the helper in
// script/lib/utils.js; the shared version should be preferred. Walks `top`
// with klaw and resolves with the paths accepted by `test`.
async function findMatchingFiles (top, test) {
  return new Promise((resolve, reject) => {
    const matches = [];
    klaw(top, {
      filter: f => path.basename(f) !== '.bin'
    })
      .on('end', () => resolve(matches))
      .on('data', item => {
        if (test(item.path)) {
          matches.push(item.path);
        }
      });
  });
}
async function findFiles (args, linter) {
let filenames = [];
let includelist = null;

View file

@ -1,6 +1,5 @@
import * as childProcess from 'child_process';
import * as fs from 'fs';
import * as klaw from 'klaw';
import * as minimist from 'minimist';
import * as os from 'os';
import * as path from 'path';
@ -9,7 +8,7 @@ import * as streamJson from 'stream-json';
import { ignore as streamJsonIgnore } from 'stream-json/filters/Ignore';
import { streamArray as streamJsonStreamArray } from 'stream-json/streamers/StreamArray';
import { chunkFilenames } from './lib/utils';
import { chunkFilenames, findMatchingFiles } from './lib/utils';
const SOURCE_ROOT = path.normalize(path.dirname(__dirname));
const LLVM_BIN = path.resolve(
@ -204,24 +203,6 @@ async function runClangTidy (
}
}
// NOTE(review): local copy of findMatchingFiles — duplicates the helper in
// script/lib/utils; the shared version should be preferred. Walks `top`
// with klaw and resolves with the paths accepted by `test`.
async function findMatchingFiles (
  top: string,
  test: (filename: string) => boolean
): Promise<string[]> {
  return new Promise((resolve) => {
    const matches = [] as string[];
    klaw(top, {
      filter: (f) => path.basename(f) !== '.bin'
    })
      .on('end', () => resolve(matches))
      .on('data', (item) => {
        if (test(item.path)) {
          matches.push(item.path);
        }
      });
  });
}
function parseCommandLine () {
const showUsage = (arg?: string) : boolean => {
if (!arg || arg.startsWith('-')) {