#!/usr/bin/env python
from __future__ import print_function
import os
import sys
import re
SOURCE_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
DOCS_DIR = os.path.join(SOURCE_ROOT, 'docs')
def main():
os.chdir(SOURCE_ROOT)
filepaths = []
totalDirs = 0
try:
for root, dirs, files in os.walk(DOCS_DIR):
totalDirs += len(dirs)
for f in files:
if f.endswith('.md'):
filepaths.append(os.path.join(root, f))
except KeyboardInterrupt:
print('Keyboard interruption. Please try again.')
return 0
totalBrokenLinks = 0
for path in filepaths:
totalBrokenLinks += getBrokenLinks(path)
print('Parsed through ' + str(len(filepaths)) +
' files within docs directory and its ' +
str(totalDirs) + ' subdirectories.')
print('Found ' + str(totalBrokenLinks) + ' broken relative links.')
return totalBrokenLinks
def getBrokenLinks(filepath):
currentDir = os.path.dirname(filepath)
brokenLinks = []
try:
f = open(filepath, 'r')
lines = f.readlines()
except KeyboardInterrupt:
print('Keyboard interruption while parsing. Please try again.')
finally:
f.close()
linkRegexLink = re.compile('\[(.*?)\]\((?P(.*?))\)')
referenceLinkRegex = re.compile(
'^\s{0,3}\[.*?\]:\s*(?P[^<\s]+|<[^<>\r\n]+>)'
)
links = []
for line in lines:
matchLinks = linkRegexLink.search(line)
matchReferenceLinks = referenceLinkRegex.search(line)
if matchLinks:
relativeLink = matchLinks.group('link')
if not str(relativeLink).startswith('http'):
links.append(relativeLink)
if matchReferenceLinks:
referenceLink = matchReferenceLinks.group('link').strip('<>')
if not str(referenceLink).startswith('http'):
links.append(referenceLink)
for link in links:
sections = link.split('#')
if len(sections) < 2:
if not os.path.isfile(os.path.join(currentDir, link)):
brokenLinks.append(link)
elif str(link).startswith('#'):
if not checkSections(sections, lines):
brokenLinks.append(link)
else:
tempFile = os.path.join(currentDir, sections[0])
if os.path.isfile(tempFile):
try:
newFile = open(tempFile, 'r')
newLines = newFile.readlines()
except KeyboardInterrupt:
print('Keyboard interruption while parsing. Please try again.')
finally:
newFile.close()
if not checkSections(sections, newLines):
brokenLinks.append(link)
else:
brokenLinks.append(link)
print_errors(filepath, brokenLinks)
return len(brokenLinks)
def checkSections(sections, lines):
invalidCharsRegex = '[^A-Za-z0-9_ \-]'
sectionHeader = sections[1]
regexSectionTitle = re.compile('# (?P.*)')
for line in lines:
matchHeader = regexSectionTitle.search(line)
if matchHeader:
# This does the following to slugify a header name:
# * Replace whitespace with dashes
# * Strip anything that's not alphanumeric or a dash
# * Anything quoted with backticks (`) is an exception and will
# not have underscores stripped
matchHeader = str(matchHeader.group('header')).replace(' ', '-')
matchHeader = ''.join(
map(
lambda match: re.sub(invalidCharsRegex, '', match[0])
+ re.sub(invalidCharsRegex + '|_', '', match[1]),
re.findall('(`[^`]+`)|([^`]+)', matchHeader),
)
)
if matchHeader.lower() == sectionHeader:
return True
return False
def print_errors(filepath, brokenLink):
if brokenLink:
print("File Location: " + filepath)
for link in brokenLink:
print("\tBroken links: " + link)
if __name__ == '__main__':
sys.exit(main())