Skip to content

Commit 0128074

Browse files
authored
Added algolia search metrics to release report (boostorg#1889) (boostorg#1914)
1 parent baae84b commit 0128074

File tree

7 files changed

+207
-32
lines changed

7 files changed

+207
-32
lines changed

config/settings.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@
7575
"health_check.db",
7676
"health_check.contrib.celery",
7777
"imagekit",
78+
"django_countries",
7879
# Allows authentication for Mailman
7980
"oauth2_provider",
8081
# Allauth dependencies:
@@ -584,3 +585,9 @@
584585
BOOST_BRANCHES = ["master", "develop"]
585586
OPENROUTER_URL = "https://openrouter.ai/api/v1"
586587
OPENROUTER_API_KEY = env("OPENROUTER_API_KEY")
588+
589+
ALGOLIA = {
590+
"app_id": env("ALGOLIA_APP_ID", None),
591+
"api_key": env("ALGOLIA_API_KEY", None),
592+
"region": env("ALGOLIA_APP_REGION", "us"),
593+
}

libraries/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,3 +365,4 @@
365365
DEVELOP_RELEASE_URL_PATH_STR = "develop"
366366
MASTER_RELEASE_URL_PATH_STR = "master"
367367
VERSION_SLUG_PREFIX = "boost-"
368+
RELEASE_REPORT_SEARCH_TOP_COUNTRIES_LIMIT = 5

libraries/forms.py

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from structlog import get_logger
12
from functools import cached_property
23
from itertools import groupby, chain
34
from operator import attrgetter
@@ -9,13 +10,20 @@
910
from django.db.models import F, Q, Count, OuterRef, Sum, When, Value, Case
1011
from django.forms import Form, ModelChoiceField, ModelForm, BooleanField
1112

13+
from algoliasearch.analytics.client import AnalyticsClientSync
14+
15+
from config import settings
1216
from core.models import RenderedContent
1317
from reports.generation import (
1418
generate_wordcloud,
1519
get_mailing_list_post_stats,
1620
get_new_subscribers_stats,
21+
generate_mailinglist_words,
22+
generate_algolia_words,
23+
get_algolia_search_stats,
1724
)
1825
from slack.models import Channel, SlackActivityBucket, SlackUser
26+
from versions.exceptions import BoostImportedDataException
1927
from versions.models import Version, ReportConfiguration
2028
from .models import (
2129
Commit,
@@ -29,6 +37,8 @@
2937
from mailing_list.models import EmailData
3038
from .utils import batched, conditional_batched
3139

40+
logger = get_logger(__name__)
41+
3242

3343
class LibraryForm(ModelForm):
3444
class Meta:
@@ -661,9 +671,13 @@ def _get_slack_stats_for_channels(
661671
}
662672

663673
def _get_dependency_data(self, library_order, version):
664-
diffs_by_id = {
665-
x["library_id"]: x for x in version.get_dependency_diffs().values()
666-
}
674+
try:
675+
dependency_diff_values = version.get_dependency_diffs().values()
676+
except BoostImportedDataException as e:
677+
logger.warning(f"Could not get dependency diffs for version {version}: {e}")
678+
dependency_diff_values = {}
679+
680+
diffs_by_id = {x["library_id"]: x for x in dependency_diff_values}
667681
diffs = []
668682
for lib_id in library_order:
669683
diffs.append(diffs_by_id.get(lib_id, {}))
@@ -697,9 +711,13 @@ def get_library_data(self, libraries, library_order, prior_version, version):
697711
def get_stats(self):
698712
report_configuration = self.cleaned_data["report_configuration"]
699713
version = Version.objects.filter(name=report_configuration.version).first()
714+
# NOTE TO FUTURE DEVS: remember to account for the fact that a report
715+
# configuration may not match with a real version in frequent cases where
716+
# reports are generated before the release version has been created.
717+
report_before_release = False if version else True
700718

701719
prior_version = None
702-
if not version:
720+
if report_before_release:
703721
# if the version is not set then the user has chosen a report configuration
704722
# that's not matching a live version, so we use the most recent version
705723
version = Version.objects.filter(name="master").first()
@@ -808,10 +826,25 @@ def get_stats(self):
808826
library in [lib["library"] for lib in library_data],
809827
)
810828
)
811-
wordcloud_base64, wordcloud_top_words = generate_wordcloud(
812-
version, prior_version
829+
# mailinglist word cloud generation
830+
mailinglist_words = generate_mailinglist_words(prior_version, version)
831+
mailinglist_wordcloud_base64, mailinglist_wordcloud_top_words = (
832+
generate_wordcloud(mailinglist_words, width=1400, height=700)
833+
)
834+
835+
# algolia search word cloud generation
836+
client = AnalyticsClientSync(**settings.ALGOLIA)
837+
# if the report is based on a live version, look for stats for that
838+
# version, otherwise use the stats for the prior (live) version
839+
search_version = prior_version if report_before_release else version
840+
search_list_words = generate_algolia_words(client, search_version)
841+
search_wordcloud_base64, search_wordcloud_top_words = generate_wordcloud(
842+
search_list_words, width=800, height=250
813843
)
814844

845+
search_stats = get_algolia_search_stats(client, search_version)
846+
logger.info(f"{search_stats=}")
847+
815848
opened_issues_count = (
816849
Issue.objects.filter(library__in=self.library_queryset)
817850
.opened_during_release(version, prior_version)
@@ -827,20 +860,23 @@ def get_stats(self):
827860
"committee_members": committee_members,
828861
"lines_added": lines_added,
829862
"lines_removed": lines_removed,
830-
"wordcloud_base64": wordcloud_base64,
831-
"wordcloud_frequencies": wordcloud_top_words,
832863
"version": version,
833864
"report_configuration": report_configuration,
834865
"prior_version": prior_version,
835866
"opened_issues_count": opened_issues_count,
836867
"closed_issues_count": closed_issues_count,
868+
"mailinglist_wordcloud_base64": mailinglist_wordcloud_base64,
869+
"mailinglist_wordcloud_frequencies": mailinglist_wordcloud_top_words,
837870
"mailinglist_counts": mailinglist_counts,
838871
"mailinglist_total": total_mailinglist_count or 0,
839872
"mailinglist_contributor_release_count": mailinglist_contributor_release_count, # noqa: E501
840873
"mailinglist_contributor_new_count": mailinglist_contributor_new_count,
841874
"mailinglist_post_stats": mailinglist_post_stats,
842875
"mailinglist_new_subscribers_stats": new_subscribers_stats,
843876
"mailinglist_charts_start_year": prior_version.release_date.year,
877+
"search_wordcloud_base64": search_wordcloud_base64,
878+
"search_wordcloud_frequencies": search_wordcloud_top_words,
879+
"search_stats": search_stats,
844880
"commit_contributors_release_count": commit_contributors_release_count,
845881
"commit_contributors_new_count": commit_contributors_new_count,
846882
"global_contributors_new_count": len(

reports/generation.py

Lines changed: 55 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import base64
22
import io
3+
import json
34
import logging
45
import random
56
from datetime import datetime, timedelta, date
@@ -11,8 +12,10 @@
1112
from django.db.models.functions import ExtractWeek, ExtractIsoYear
1213
from matplotlib import pyplot as plt
1314
from wordcloud import WordCloud, STOPWORDS
15+
from algoliasearch.analytics.client import AnalyticsClientSync
1416

1517
from core.models import SiteSettings
18+
from libraries.constants import RELEASE_REPORT_SEARCH_TOP_COUNTRIES_LIMIT
1619
from libraries.models import WordcloudMergeWord # TODO: move model to this app
1720
from mailing_list.models import PostingData, SubscriptionData
1821
from reports.constants import WORDCLOUD_FONT
@@ -21,14 +24,45 @@
2124
logger = logging.getLogger(__name__)
2225

2326

27+
def generate_mailinglist_words(
    prior_version: Version, version: Version
) -> dict[str, int]:
    """Generate word frequencies from mailing list content between two versions.

    Every item yielded by ``get_mail_content(version, prior_version)`` is
    tokenised with ``WordCloud.process_text``. Counts are summed
    case-insensitively and tokens shorter than two characters are dropped.

    Args:
        prior_version: Second argument passed to ``get_mail_content``.
        version: First argument passed to ``get_mail_content``.

    Returns:
        Mapping of lower-cased word to its total count. Empty when
        ``get_mail_content`` yields nothing.
    """
    # Build the tokenizer once: constructing a WordCloud per mail item is
    # loop-invariant work. It is given the same stopword set that
    # generate_wordcloud configures, so the site-configured ignore words are
    # dropped while tokenising instead of only the library defaults.
    tokenizer = WordCloud(
        stopwords=STOPWORDS | SiteSettings.load().wordcloud_ignore_set
    )

    word_frequencies: dict[str, int] = {}
    for content in get_mail_content(version, prior_version):
        for word, count in tokenizer.process_text(content).items():
            if len(word) < 2:
                continue
            key = word.lower()
            word_frequencies[key] = word_frequencies.get(key, 0) + count

    return word_frequencies
42+
43+
44+
def generate_algolia_words(
    client: AnalyticsClientSync, version: Version
) -> dict[str, int]:
    """Generate search-term frequencies from the Algolia top searches.

    Requests up to 100 top searches for the index named by
    ``version.stripped_boost_url_slug`` and keeps only the terms whose count
    is greater than one.

    Args:
        client: Analytics client used to call ``get_top_searches``.
        version: Object whose ``stripped_boost_url_slug`` names the index.

    Returns:
        Mapping of search term to its count. Empty when the response carries
        no ``"searches"`` entries.
    """
    response = client.get_top_searches(
        index=version.stripped_boost_url_slug,
        limit=100,
    )
    search_data = json.loads(response.to_json())
    # A missing or null "searches" entry means there is nothing to report;
    # return an empty mapping rather than raising, so the caller's
    # empty-input handling can take over.
    searches = search_data.get("searches") or []
    return {row["search"]: row["count"] for row in searches if row["count"] > 1}
54+
55+
2456
def generate_wordcloud(
25-
version: Version, prior_version: Version
57+
word_frequencies: dict[str, int], width: int, height: int
2658
) -> tuple[str | None, list]:
2759
"""Generates a wordcloud png and returns it as a base64 string and word frequencies.
2860
2961
Returns:
3062
Tuple of (base64_encoded_png_string, wordcloud_top_words)
3163
"""
64+
if not word_frequencies:
65+
return None, []
3266
font_relative_path = f"font/{WORDCLOUD_FONT}"
3367
font_full_path = finders.find(font_relative_path)
3468

@@ -38,23 +72,11 @@ def generate_wordcloud(
3872
wc = WordCloud(
3973
mode="RGBA",
4074
background_color=None,
41-
width=1400,
42-
height=700,
75+
width=width,
76+
height=height,
4377
stopwords=STOPWORDS | SiteSettings.load().wordcloud_ignore_set,
4478
font_path=font_full_path,
4579
)
46-
word_frequencies = {}
47-
for content in get_mail_content(version, prior_version):
48-
for key, val in wc.process_text(content).items():
49-
if len(key) < 2:
50-
continue
51-
key_lower = key.lower()
52-
if key_lower not in word_frequencies:
53-
word_frequencies[key_lower] = 0
54-
word_frequencies[key_lower] += val
55-
if not word_frequencies:
56-
return None, []
57-
5880
word_frequencies = boost_normalize_words(
5981
word_frequencies,
6082
{x.from_word: x.to_word for x in WordcloudMergeWord.objects.all()},
@@ -181,3 +203,21 @@ def get_new_subscribers_stats(start_date: datetime, end_date: datetime):
181203
current += timedelta(days=7) # hop by weeks
182204

183205
return chart_data
206+
207+
208+
def get_algolia_search_stats(client: AnalyticsClientSync, version: Version) -> dict:
    """Collect Algolia search totals and per-country counts for a version.

    Both queries target the index named by ``version.stripped_boost_url_slug``.

    Args:
        client: Analytics client used to call ``get_searches_count`` and
            ``get_top_countries``.
        version: Object whose ``stripped_boost_url_slug`` names the index.

    Returns:
        A dict with three keys:
            ``total_searches``: the ``"count"`` value of the searches-count
                response, or ``None`` when it is absent.
            ``country_stats``: mapping of country to search count for up to
                100 countries.
            ``top_countries``: list of ``(country, count)`` tuples, highest
                count first, capped at
                ``RELEASE_REPORT_SEARCH_TOP_COUNTRIES_LIMIT`` entries.
    """
    index = version.stripped_boost_url_slug

    # search data
    search_data = json.loads(client.get_searches_count(index=index).to_json())

    # country data
    country_data = json.loads(
        client.get_top_countries(index=index, limit=100).to_json()
    )
    # A missing or null "countries" entry is treated as "no data".
    country_rows = country_data.get("countries") or []
    country_stats = {row["country"]: row["count"] for row in country_rows}

    # Rank explicitly so "top" never depends on the order of the response.
    # sorted() is stable, so rows that already arrive in descending order
    # (ties included) keep exactly that order.
    ranked_countries = sorted(
        country_stats.items(), key=lambda item: item[1], reverse=True
    )

    return {
        "total_searches": search_data.get("count"),
        "country_stats": country_stats,
        "top_countries": ranked_countries[:RELEASE_REPORT_SEARCH_TOP_COUNTRIES_LIMIT],
    }

requirements.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ django-allauth
55
django-allauth[socialaccount]
66
django-anymail[mailgun]
77
django-cors-headers
8+
django-countries
89
django-db-geventpool
910
django-extensions
1011
django-health-check
@@ -34,6 +35,7 @@ jsoncomment
3435
unidecode
3536
wordcloud
3637
lxml
38+
algoliasearch
3739
openai
3840

3941
# Logging

0 commit comments

Comments
 (0)