|
| 1 | +"""This script help checking inconsistent links. |
| 2 | +
|
| 3 | +That is to say, links that have the same title but go to different places. |
| 4 | +This is useful for screen-reader and accessibility devices, where the user may |
| 5 | +say "Go to X", but if there are 2 links named "X" this creates ambiguity. |
| 6 | +
|
| 7 | +
|
| 8 | +Example (links that have the same name, but different URL): |
| 9 | +
|
| 10 | + We have a JavaScript <a href="javascript.html">API</a> and |
| 11 | + a Python <a href="python.html">API</a>. |
| 12 | +
|
| 13 | +How to fix (give the links different names): |
| 14 | +
|
| 15 | + We have a <a href="javascript.html">JavaScript API</a> and |
| 16 | + a <a href="python.html">Python API</a>. |
| 17 | +""" |
| 18 | + |
| 19 | +import os |
| 20 | +import sys |
| 21 | +from collections import defaultdict |
| 22 | +from urllib.parse import urljoin |
| 23 | + |
| 24 | +from bs4 import BeautifulSoup |
| 25 | + |
# When looking at inconsistent links across pages, a number of link texts
# are recurrent and appear on many pages ("next", "previous", ...), so we
# ignore these.
# This is only ever used for membership tests (`in`) while scanning, so a
# frozenset gives O(1) lookup and guards against accidental mutation.
ignores = frozenset(
    [
        "#",
        "next",
        "previous",
        "[source]",
        "edit on github",
        "[docs]",
        "read more ...",
        "show source",
        "module",
    ]
)
| 41 | + |
| 42 | + |
def find_html_files(folder_path):
    """Return the paths of all ``.html`` files under *folder_path* (recursive)."""
    return [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(folder_path)
        for filename in filenames
        if filename.endswith(".html")
    ]
| 51 | + |
| 52 | + |
class Checker:
    """Link checker: accumulate links from one or more pages, then report
    link texts that point to more than one URL."""

    # Maps lower-cased link text -> list of (resolved URL, page path) pairs.
    links: dict[str, list]

    def __init__(self):
        self.links = defaultdict(list)

    def scan(self, html_content, file_path):
        """Scan given HTML content (from ``file_path``) for anchor links.

        Anchors whose text is in ``ignores``, or that point within the same
        page (``#...``), are skipped; everything else is recorded in
        ``self.links`` keyed by its lower-cased text.
        """
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(html_content, "html.parser")

        # Extract all anchor tags
        for a_tag in soup.find_all("a", href=True):
            url = a_tag["href"]

            # These are usually links into the same page ("see below", or
            # even header anchors), so we exclude those.
            if url.startswith("#"):
                continue
            content = a_tag.text.strip().lower()
            if content in ignores:
                continue
            # Some links are "$Title\nNext", or "$Title\nprev", so we only
            # want to look at what is before the `\n`
            if content.split("\n")[0] in ignores:
                continue

            # Resolve relative hrefs against the page's path so that the
            # same target reached from different pages compares equal.
            fullurl = urljoin(file_path, url)
            self.links[content].append((fullurl, file_path))

    def duplicates(self):
        """Print link texts that resolve to two or more distinct URLs."""
        for content, url_pages in self.links.items():
            uniq_url = {u for u, _ in url_pages}
            if len(uniq_url) >= 2:
                # BUGFIX: {content!r} already adds quotes; the original
                # wrapped it in literal quotes too, printing ""'text'"".
                print(
                    f"The link text {content!r} appears {len(url_pages)} times, "
                    f"and links to {len(uniq_url)} different URLs, on the following pages:"
                )
                # Group the offending pages by which URL they link to.
                pages_by_url = defaultdict(list)
                for u, p in url_pages:
                    pages_by_url[u].append(p)
                for u, ps in pages_by_url.items():
                    print(" ", u, "in")
                    for p in ps:
                        print(" ", p)
| 103 | + |
| 104 | + |
if len(sys.argv) == 3 and sys.argv[2] == "--all":
    # Site-wide mode: feed every page into a single Checker, so identical
    # link text must agree across the whole site.
    c = Checker()

    for file in find_html_files(sys.argv[1]):
        # BUGFIX: be explicit about the encoding — the default is the
        # locale encoding, which mangles UTF-8 HTML on some platforms
        # (Sphinx output is UTF-8; TODO confirm for other generators).
        with open(file, encoding="utf-8") as f:
            data = f.read()
        c.scan(data, file)

    c.duplicates()
elif len(sys.argv) == 2:
    # Page-wise mode: a fresh Checker per page, so only duplicates within
    # a single page are reported.
    for file in find_html_files(sys.argv[1]):
        with open(file, encoding="utf-8") as f:
            data = f.read()
        c = Checker()
        c.scan(data, file)
        c.duplicates()
else:
    print(
        """
Check page-wise link consistency
(links with the same name on the same page should go to the same URL)

    python tools/divergent_links.py docs/_build/html/

Check site-wide link consistency
(links with the same name across all pages should go the same URL)

    python tools/divergent_links.py docs/_build/html/ --all

"""
    )
    sys.exit(1)
0 commit comments