#!/usr/bin/env python from __future__ import print_function import os import sys import re SOURCE_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) DOCS_DIR = os.path.join(SOURCE_ROOT, 'docs') def main(): os.chdir(SOURCE_ROOT) filepaths = [] totalDirs = 0 try: for root, dirs, files in os.walk(DOCS_DIR): totalDirs += len(dirs) for f in files: if f.endswith('.md'): filepaths.append(os.path.join(root, f)) except KeyboardInterrupt: print('Keyboard interruption. Please try again.') return 0 totalBrokenLinks = 0 for path in filepaths: totalBrokenLinks += getBrokenLinks(path) print('Parsed through ' + str(len(filepaths)) + ' files within docs directory and its ' + str(totalDirs) + ' subdirectories.') print('Found ' + str(totalBrokenLinks) + ' broken relative links.') return totalBrokenLinks def getBrokenLinks(filepath): currentDir = os.path.dirname(filepath) brokenLinks = [] try: f = open(filepath, 'r') lines = f.readlines() except KeyboardInterrupt: print('Keyboard interruption while parsing. Please try again.') finally: f.close() linkRegexLink = re.compile('\[(.*?)\]\((?P(.*?))\)') referenceLinkRegex = re.compile( '^\s{0,3}\[.*?\]:\s*(?P[^<\s]+|<[^<>\r\n]+>)' ) links = [] for line in lines: matchLinks = linkRegexLink.search(line) matchReferenceLinks = referenceLinkRegex.search(line) if matchLinks: relativeLink = matchLinks.group('link') if not str(relativeLink).startswith('http'): links.append(relativeLink) if matchReferenceLinks: referenceLink = matchReferenceLinks.group('link').strip('<>') if not str(referenceLink).startswith('http'): links.append(referenceLink) for link in links: sections = link.split('#') if len(sections) < 2: if not os.path.isfile(os.path.join(currentDir, link)): brokenLinks.append(link) elif str(link).startswith('#'): if not checkSections(sections, lines): brokenLinks.append(link) else: tempFile = os.path.join(currentDir, sections[0]) if os.path.isfile(tempFile): try: newFile = open(tempFile, 'r') newLines = newFile.readlines() except KeyboardInterrupt: print('Keyboard interruption while parsing. Please try again.') finally: newFile.close() if not checkSections(sections, newLines): brokenLinks.append(link) else: brokenLinks.append(link) print_errors(filepath, brokenLinks) return len(brokenLinks) def checkSections(sections, lines): invalidCharsRegex = '[^A-Za-z0-9_ \-]' sectionHeader = sections[1] regexSectionTitle = re.compile('# (?P
.*)') for line in lines: matchHeader = regexSectionTitle.search(line) if matchHeader: # This does the following to slugify a header name: # * Replace whitespace with dashes # * Strip anything that's not alphanumeric or a dash # * Anything quoted with backticks (`) is an exception and will # not have underscores stripped matchHeader = str(matchHeader.group('header')).replace(' ', '-') matchHeader = ''.join( map( lambda match: re.sub(invalidCharsRegex, '', match[0]) + re.sub(invalidCharsRegex + '|_', '', match[1]), re.findall('(`[^`]+`)|([^`]+)', matchHeader), ) ) if matchHeader.lower() == sectionHeader: return True return False def print_errors(filepath, brokenLink): if brokenLink: print("File Location: " + filepath) for link in brokenLink: print("\tBroken links: " + link) if __name__ == '__main__': sys.exit(main())