Skip to content

Commit 7a42c1e

Browse files
committed
Synced webpages from sitemap to Document model for search.
1 parent 72b3840 commit 7a42c1e

File tree

8 files changed

+393
-12
lines changed

8 files changed

+393
-12
lines changed

docs/management/commands/update_docs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@ def build_doc_release(self, release, force=False, interactive=False):
132132
if self.verbosity >= 1:
133133
self.stdout.write(f"Starting update for {release} at {datetime.now()}...")
134134

135+
release.sync_from_sitemap(force=force)
136+
135137
# checkout_dir is shared for all languages.
136138
checkout_dir = settings.DOCS_BUILD_ROOT.joinpath("sources", release.version)
137139
parent_build_dir = settings.DOCS_BUILD_ROOT.joinpath(

docs/models.py

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from functools import partial, reduce
66
from pathlib import Path
77

8+
import requests
89
from django.conf import settings
910
from django.contrib.postgres.indexes import GinIndex
1011
from django.contrib.postgres.search import (
@@ -34,8 +35,16 @@
3435
START_SEL,
3536
STOP_SEL,
3637
TSEARCH_CONFIG_LANGUAGES,
38+
DocumentationCategory,
39+
fetch_html,
3740
get_document_search_vector,
3841
)
42+
from .utils import extract_inner_html
43+
44+
45+
def get_search_config(lang):
    """
    Return the PostgreSQL text search configuration to use for ``lang``.

    Only the first two characters of ``lang`` are used for the lookup in
    ``TSEARCH_CONFIG_LANGUAGES``; ``DEFAULT_TEXT_SEARCH_CONFIG`` is returned
    when that mapping has no entry for them.
    """
    language_code = lang[:2]
    if language_code in TSEARCH_CONFIG_LANGUAGES:
        return TSEARCH_CONFIG_LANGUAGES[language_code]
    return DEFAULT_TEXT_SEARCH_CONFIG
3948

4049

4150
class DocumentReleaseQuerySet(models.QuerySet):
@@ -175,7 +184,7 @@ def sync_to_db(self, decoded_documents):
175184
the database. Deletes all the release's documents first then
176185
reinserts them as needed.
177186
"""
178-
self.documents.all().delete()
187+
self.documents.exclude(metadata__parents=DocumentationCategory.WEBSITE).delete()
179188

180189
# Read excluded paths from robots.docs.txt.
181190
robots_path = settings.BASE_DIR.joinpath(
@@ -206,16 +215,66 @@ def sync_to_db(self, decoded_documents):
206215
path=document_path,
207216
title=html.unescape(strip_tags(document["title"])),
208217
metadata=document,
209-
config=TSEARCH_CONFIG_LANGUAGES.get(
210-
self.lang[:2], DEFAULT_TEXT_SEARCH_CONFIG
211-
),
218+
config=get_search_config(self.lang),
212219
)
213-
for document in self.documents.all():
220+
for document in self.documents.exclude(
221+
metadata__parents=DocumentationCategory.WEBSITE
222+
):
214223
document.metadata["breadcrumbs"] = list(
215224
Document.objects.breadcrumbs(document).values("title", "path")
216225
)
217226
document.save(update_fields=("metadata",))
218227

228+
def sync_from_sitemap(self, force=False):
    """
    Create a Document for every page listed in the website's sitemaps.

    Does nothing unless ``self.is_dev`` is true. Each stored page gets
    ``DocumentationCategory.WEBSITE`` as its ``parents`` metadata and a
    single "Website" breadcrumb.

    A URL is skipped when a website document with the same path already
    exists, when ``fetch_html()`` raises ``requests.RequestException``, or
    when ``extract_inner_html()`` raises ``ValueError`` for the ``main`` or
    ``h1`` tag.

    If ``force`` is true, all existing website documents are deleted first
    so that every page is fetched and stored again.
    """
    # NOTE(review): imported locally, presumably to avoid a circular import
    # at module load time -- confirm.
    from djangoproject.urls.www import sitemaps

    if not self.is_dev:
        return

    # NOTE(review): this lookup is not scoped to ``self``; it matches the
    # website documents of every release. Confirm that is intended.
    website_documents = Document.objects.filter(
        metadata__parents=DocumentationCategory.WEBSITE
    )
    if force:
        website_documents.delete()

    known_paths = set(website_documents.values_list("path", flat=True))

    for sitemap in sitemaps.values():
        for url in sitemap().get_urls():
            path = url["location"]
            if path in known_paths:
                continue
            # Best effort: a page that can't be fetched or parsed is left
            # out rather than aborting the whole sync.
            try:
                page_html = fetch_html(path)
            except requests.RequestException:
                continue
            try:
                main_html = extract_inner_html(page_html, tag="main")
                title = extract_inner_html(page_html, tag="h1")
            except ValueError:
                continue
            Document.objects.create(
                release=self,
                path=path,
                title=title,
                metadata={
                    "body": main_html,
                    "breadcrumbs": [
                        {
                            "path": DocumentationCategory.WEBSITE,
                            "title": "Website",
                        },
                    ],
                    "parents": DocumentationCategory.WEBSITE,
                    "title": title,
                    "toc": "",
                },
                config=get_search_config(self.lang),
            )
            # Remember the path so a URL listed by more than one sitemap
            # isn't fetched and stored a second time during this run.
            known_paths.add(path)
277+
219278

220279
def _clean_document_path(path):
221280
# We have to be a bit careful to reverse-engineer the correct
@@ -228,7 +287,9 @@ def _clean_document_path(path):
228287

229288

230289
def document_url(doc):
231-
if doc.path:
290+
if doc.metadata.get("parents") == DocumentationCategory.WEBSITE:
291+
return doc.path
292+
elif doc.path:
232293
kwargs = {
233294
"lang": doc.release.lang,
234295
"version": doc.release.version,
@@ -273,6 +334,14 @@ def search(self, query_text, release, document_category=None):
273334
config=models.F("config"),
274335
)
275336
base_filter = Q(release_id=release.id)
337+
if release.lang == "en" and not release.is_dev:
338+
dev_release = DocumentRelease.objects.get_by_version_and_lang(
339+
"dev", "en"
340+
)
341+
base_filter |= Q(
342+
release_id=dev_release.id,
343+
metadata__parents=DocumentationCategory.WEBSITE,
344+
)
276345
if document_category:
277346
base_filter &= Q(metadata__parents__startswith=document_category)
278347
base_qs = (

docs/search.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import requests
12
from django.contrib.postgres.search import SearchVector
23
from django.db.models import TextChoices
34
from django.db.models.fields.json import KeyTextTransform
@@ -67,10 +68,41 @@ class DocumentationCategory(TextChoices):
6768
TOPICS = "topics", _("Using Django")
6869
HOWTO = "howto", _("How-to guides")
6970
RELEASE_NOTES = "releases", _("Release notes")
71+
WEBSITE = "website", _("Django Website")
7072

7173
@classmethod
def parse(cls, value, default=None):
    """
    Return the member of this class matching ``value``.

    If ``cls(value)`` raises ``ValueError``, return ``default`` instead
    (``None`` unless the caller passes something else).
    """
    try:
        return cls(value)
    except ValueError:
        # Honour the caller-supplied fallback instead of always
        # returning None.
        return default
79+
80+
81+
def fetch_html(url, timeout=10):
    """
    Return the HTML text of the page at ``url``.

    Sends a GET request with browser-like ``User-Agent``, ``Accept`` and
    ``Accept-Language`` headers, passing ``timeout`` through to
    ``requests.get()``.

    Raises ``requests.RequestException`` if the response status code is not
    200 or if its ``Content-Type`` header does not contain ``text/html``.
    """
    browser_user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
    )
    request_headers = {
        "User-Agent": browser_user_agent,
        "Accept": "text/html",
        "Accept-Language": "en-US,en;q=0.9",
    }

    page = requests.get(url, headers=request_headers, timeout=timeout)

    # Reject anything other than a plain 200 response up front.
    status = page.status_code
    if status != 200:
        raise requests.RequestException(
            f"Failed to fetch {url}, status code: {status}"
        )

    # A missing Content-Type header is treated as an empty string, which
    # fails the check below.
    content_type = page.headers.get("Content-Type", "")
    if "text/html" not in content_type:
        raise requests.RequestException(f"Unexpected Content-Type: {content_type}")

    return page.text

docs/templates/docs/search_results.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,11 @@ <h2>{% translate "No search query given" %}</h2>
4343
{% for result in page.object_list %}
4444
<dt>
4545
<h2 class="result-title">
46-
<a href="{% url 'document-detail' lang=result.release.lang version=result.release.version url=result.path host 'docs' %}{% if not start_sel in result.headline %}{{ result.highlight|fragment }}{% endif %}">{{ result.headline|safe }}</a>
46+
<a href="{{ result.get_absolute_url }}{% if not start_sel in result.headline %}{{ result.highlight|fragment }}{% endif %}">{{ result.headline|safe }}</a>
4747
</h2>
4848
<span class="meta breadcrumbs">
4949
{% for breadcrumb in result.breadcrumbs %}
50-
<a href="{% url 'document-detail' lang=result.release.lang version=result.release.version url=breadcrumb.path host 'docs' %}">{{ breadcrumb.title }}</a>{% if not forloop.last %} <span class="arrow">»</span>{% endif %}
50+
<a href="{{ result.get_absolute_url }}">{{ breadcrumb.title }}</a>{% if not forloop.last %} <span class="arrow">»</span>{% endif %}
5151
{% endfor %}
5252
</span>
5353
</dt>
@@ -60,7 +60,7 @@ <h2 class="result-title">
6060
<ul class="code-links">
6161
{% for name, value in result_code_links.items %}
6262
<li>
63-
<a href="{% url 'document-detail' lang=result.release.lang version=result.release.version url=result.path host 'docs' %}#{{ value.full_path }}">
63+
<a href="{{ result.get_absolute_url }}#{{ value.full_path }}">
6464
<div>
6565
<code>{{ name }}</code>
6666
{% if value.module_path %}<div class="meta">{{ value.module_path }}</div>{% endif %}

0 commit comments

Comments
 (0)