2017-12-13 02:32:47 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
2019-06-15 17:26:09 +00:00
|
|
|
from __future__ import print_function
|
2017-12-13 02:32:47 +00:00
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
# Repository root: parent of the directory containing this script.
SOURCE_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
# The docs/ tree that is walked for markdown files to link-check.
DOCS_DIR = os.path.join(SOURCE_ROOT, 'docs')
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Walk the docs tree, check every markdown file for broken
    relative links, and return the total number found.

    Returns 0 early if the user interrupts the directory walk.
    """
    os.chdir(SOURCE_ROOT)

    filepaths = []
    totalDirs = 0
    try:
        for root, dirs, files in os.walk(DOCS_DIR):
            totalDirs += len(dirs)
            # Collect only markdown files; everything else is ignored.
            filepaths.extend(
                os.path.join(root, name)
                for name in files
                if name.endswith('.md')
            )
    except KeyboardInterrupt:
        print('Keyboard interruption. Please try again.')
        return 0

    totalBrokenLinks = sum(getBrokenLinks(path) for path in filepaths)

    print('Parsed through ' + str(len(filepaths)) +
          ' files within docs directory and its ' +
          str(totalDirs) + ' subdirectories.')
    print('Found ' + str(totalBrokenLinks) + ' broken relative links.')
    return totalBrokenLinks
|
2017-12-13 02:32:47 +00:00
|
|
|
|
|
|
|
|
|
|
|
def getBrokenLinks(filepath):
    """Scan one markdown file and return the number of broken relative
    links it contains; the broken links are also reported via
    print_errors().

    Both inline links ``[text](target)`` and reference-style
    definitions ``[label]: target`` are checked; http(s) URLs are
    skipped.
    """
    currentDir = os.path.dirname(filepath)
    brokenLinks = []

    try:
        # 'with' guarantees the handle is closed on every path.  The
        # original 'finally: f.close()' raised NameError when open()
        # itself failed, because f was never bound.
        with open(filepath, 'r', encoding="utf-8") as f:
            lines = f.readlines()
    except KeyboardInterrupt:
        print('Keyboard interruption while parsing. Please try again.')
        # Nothing was parsed; the original fell through here and
        # crashed with NameError on the unbound 'lines'.
        return 0

    # Raw strings: the originals were plain strings whose \[, \( and \s
    # are invalid escape sequences (warnings on modern Python).
    linkRegexLink = re.compile(r'\[(.*?)\]\((?P<link>(.*?))\)')
    referenceLinkRegex = re.compile(
        r'^\s{0,3}\[.*?\]:\s*(?P<link>[^<\s]+|<[^<>\r\n]+>)'
    )
    links = []
    for line in lines:
        matchLinks = linkRegexLink.search(line)
        matchReferenceLinks = referenceLinkRegex.search(line)
        if matchLinks:
            relativeLink = matchLinks.group('link')
            # Only relative links are verified; external URLs are not.
            if not str(relativeLink).startswith('http'):
                links.append(relativeLink)
        if matchReferenceLinks:
            referenceLink = matchReferenceLinks.group('link').strip('<>')
            if not str(referenceLink).startswith('http'):
                links.append(referenceLink)

    for link in links:
        sections = link.split('#')
        if len(sections) < 2:
            # Plain file link with no anchor: the target must exist.
            if not os.path.isfile(os.path.join(currentDir, link)):
                brokenLinks.append(link)
        elif str(link).startswith('#'):
            # Anchor within this same file.
            if not checkSections(sections, lines):
                brokenLinks.append(link)
        else:
            # Anchor inside another file: the file must exist and must
            # contain a matching header.
            tempFile = os.path.join(currentDir, sections[0])
            if os.path.isfile(tempFile):
                try:
                    with open(tempFile, 'r', encoding="utf-8") as newFile:
                        newLines = newFile.readlines()
                except KeyboardInterrupt:
                    print('Keyboard interruption while parsing. Please try again.')
                    # Cannot verify this link; the original fell through
                    # and used an unbound (or stale, from a previous
                    # iteration) 'newLines'.
                    continue
                if not checkSections(sections, newLines):
                    brokenLinks.append(link)
            else:
                brokenLinks.append(link)

    print_errors(filepath, brokenLinks)
    return len(brokenLinks)
|
|
|
|
|
|
|
|
|
2017-12-14 03:06:16 +00:00
|
|
|
def checkSections(sections, lines):
    """Return True if the anchor slug in sections[1] matches a header
    found in lines.

    sections is a link already split on '#'; sections[1] is the anchor
    (expected as a lower-case, dash-separated slug).  lines is the list
    of lines of the markdown file the anchor should point into.
    """
    # Characters stripped when slugifying a header name.  Raw strings
    # throughout: the original plain-string patterns contained the
    # invalid escape sequence \- (a warning on modern Python).
    invalidCharsRegex = r'[^A-Za-z0-9_ \-]'
    sectionHeader = sections[1]
    # NOTE(review): search() also fires on '# ' appearing mid-line
    # (e.g. '## Sub' or trailing comments) — preserved as-is.
    regexSectionTitle = re.compile(r'# (?P<header>.*)')
    for line in lines:
        matchHeader = regexSectionTitle.search(line)
        if matchHeader:
            # This does the following to slugify a header name:
            # * Replace whitespace with dashes
            # * Strip anything that's not alphanumeric or a dash
            # * Anything quoted with backticks (`) is an exception and will
            #   not have underscores stripped
            matchHeader = str(matchHeader.group('header')).replace(' ', '-')
            matchHeader = ''.join(
                map(
                    lambda match: re.sub(invalidCharsRegex, '', match[0])
                    + re.sub(invalidCharsRegex + '|_', '', match[1]),
                    re.findall(r'(`[^`]+`)|([^`]+)', matchHeader),
                )
            )
            if matchHeader.lower() == sectionHeader:
                return True
    return False
|
|
|
|
|
|
|
|
|
|
|
|
def print_errors(filepath, brokenLink):
    """Print the file path followed by each broken link found in it.

    Prints nothing when brokenLink is empty.
    """
    if not brokenLink:
        return
    print("File Location: " + filepath)
    for link in brokenLink:
        print("\tBroken links: " + link)
|
2017-12-13 02:32:47 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Exit status is main()'s return value: the number of broken links
    # found (0 when the docs are clean or the walk was interrupted).
    sys.exit(main())
|