Skip to content

Commit 75c85a6

Browse files
authored
refactor: unify query building logic for merge suggestions [CM-765] (#3581)
1 parent 6b0cc88 commit 75c85a6

File tree

7 files changed

+261
-266
lines changed

7 files changed

+261
-266
lines changed

services/apps/merge_suggestions_worker/src/activities/common.ts

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -11,19 +11,11 @@ import {
1111
ILLMConsumableMember,
1212
ILLMConsumableOrganization,
1313
ILLMSuggestionVerdict,
14-
PlatformType,
1514
} from '@crowd/types'
1615

1716
import { svc } from '../main'
1817
import { ILLMResult } from '../types'
1918

20-
export const EMAIL_AS_USERNAME_PLATFORMS = [
21-
PlatformType.GIT,
22-
PlatformType.JIRA,
23-
PlatformType.CONFLUENCE,
24-
PlatformType.GERRIT,
25-
]
26-
2719
export async function getAllTenants(): Promise<ITenant[]> {
2820
const tenantRepository = new TenantRepository(svc.postgres.writer.connection(), svc.log)
2921
const tenants = await tenantRepository.getAllTenants()

services/apps/merge_suggestions_worker/src/activities/memberMergeSuggestions.ts

Lines changed: 144 additions & 158 deletions
Original file line number | Diff line number | Diff line change
@@ -9,19 +9,22 @@ import { buildFullMemberForMergeSuggestions } from '@crowd/opensearch'
99
import {
1010
ILLMConsumableMember,
1111
IMemberBaseForMergeSuggestions,
12+
IMemberIdentity,
1213
IMemberMergeSuggestion,
1314
MemberIdentityType,
1415
MemberMergeSuggestionTable,
1516
OpenSearchIndex,
16-
PlatformType,
1717
} from '@crowd/types'
1818

19+
import { EMAIL_AS_USERNAME_PLATFORMS } from '../enums'
1920
import { svc } from '../main'
2021
import MemberSimilarityCalculator from '../memberSimilarityCalculator'
21-
import { ISimilarMemberOpensearchResult, ISimilarityFilter } from '../types'
22-
import { chunkArray } from '../utils'
23-
24-
import { EMAIL_AS_USERNAME_PLATFORMS } from './common'
22+
import {
23+
ISimilarMemberOpensearchResult,
24+
ISimilarityFilter,
25+
OpenSearchQueryClauseBuilder,
26+
} from '../types'
27+
import { chunkArray, isEmailAsUsernamePlatform, isNumeric, stripProtocol } from '../utils'
2528

2629
/**
2730
* Finds similar members of given member in a tenant
@@ -74,180 +77,163 @@ export async function getMemberMergeSuggestions(
7477
const unverifiedEmailUsernameMatches = []
7578
const unverifiedUsernameEmailMatches = []
7679

77-
// Process up to 100 identities
80+
// Process up to 75 identities
7881
// This is a safety limit to prevent OpenSearch max clause errors
79-
for (const identity of identities.slice(0, 100)) {
80-
if (identity.value && identity.value.length > 0) {
81-
if (identity.verified) {
82-
// Verified identities: exact match on unverified identities
83-
verifiedExactMatches.push({
84-
value: identity.value,
85-
platform: identity.platform,
86-
})
87-
88-
// Email-as-username: verified email matching unverified username
89-
if (identity.type === MemberIdentityType.EMAIL) {
90-
verifiedEmailUsernameMatches.push({
91-
value: identity.value,
92-
})
82+
for (const { verified, value, platform, type } of identities.slice(0, 75)) {
83+
const isEmail = type === MemberIdentityType.EMAIL
84+
const isUsername = type === MemberIdentityType.USERNAME
85+
const isEmailAsUsername = isUsername && isEmailAsUsernamePlatform(platform)
86+
87+
const targetLists = verified
88+
? {
89+
exact: verifiedExactMatches,
90+
emailUsername: verifiedEmailUsernameMatches,
91+
usernameEmail: verifiedUsernameEmailMatches,
92+
fuzzy: verifiedFuzzyMatches,
9393
}
94-
95-
// Email-as-username: verified username matching unverified email
96-
if (
97-
identity.type === MemberIdentityType.USERNAME &&
98-
EMAIL_AS_USERNAME_PLATFORMS.includes(identity.platform as PlatformType)
99-
) {
100-
verifiedUsernameEmailMatches.push({
101-
value: identity.value,
102-
})
94+
: {
95+
exact: unverifiedExactMatches,
96+
emailUsername: unverifiedEmailUsernameMatches,
97+
usernameEmail: unverifiedUsernameEmailMatches,
10398
}
10499

105-
// Fuzzy search for verified identities (non-numeric only)
106-
if (Number.isNaN(Number(identity.value))) {
107-
const cleanedIdentityName = identity.value.replace(/^https?:\/\//, '')
108-
verifiedFuzzyMatches.push({
109-
value: identity.value,
110-
cleanedValue: cleanedIdentityName,
111-
})
112-
}
113-
} else {
114-
// Unverified identities: exact match on verified identities
115-
unverifiedExactMatches.push({
116-
value: identity.value,
117-
platform: identity.platform,
118-
})
100+
// Exact matches
101+
targetLists.exact.push({ value, platform })
119102

120-
// Email-as-username: unverified email matching verified username
121-
if (identity.type === MemberIdentityType.EMAIL) {
122-
unverifiedEmailUsernameMatches.push({
123-
value: identity.value,
124-
})
125-
}
103+
// Email-as-username cases
104+
if (isEmail) {
105+
targetLists.emailUsername.push({ value })
106+
} else if (isEmailAsUsername) {
107+
targetLists.usernameEmail.push({ value })
108+
}
126109

127-
// Email-as-username: unverified username matching verified email
128-
if (
129-
identity.type === MemberIdentityType.USERNAME &&
130-
EMAIL_AS_USERNAME_PLATFORMS.includes(identity.platform as PlatformType)
131-
) {
132-
unverifiedUsernameEmailMatches.push({
133-
value: identity.value,
134-
})
135-
}
136-
}
110+
// Fuzzy matches (only for verified & non-numeric)
111+
if (verified && !isNumeric(value)) {
112+
targetLists.fuzzy.push({ value: stripProtocol(value) })
137113
}
138114
}
139115

140116
// Build OpenSearch query clauses
141117
const identitiesShould = []
142-
const CHUNK_SIZE = 20 // Split queries into chunks to avoid OpenSearch limits
143-
144-
// Query 1: Verified -> Unverified exact matches
145-
for (const { value, platform } of verifiedExactMatches) {
146-
identitiesShould.push({
147-
bool: {
148-
must: [
149-
{ term: { [`nested_identities.keyword_value`]: value } },
150-
{ match: { [`nested_identities.string_platform`]: platform } },
151-
{ term: { [`nested_identities.bool_verified`]: false } },
152-
],
153-
},
154-
})
155-
}
156-
157-
// Query 2: Verified email -> Unverified username (email-as-username platforms)
158-
for (const { value } of verifiedEmailUsernameMatches) {
159-
identitiesShould.push({
160-
bool: {
161-
must: [
162-
{ term: { [`nested_identities.keyword_value`]: value } },
163-
{ terms: { [`nested_identities.string_platform`]: EMAIL_AS_USERNAME_PLATFORMS } },
164-
{ term: { [`nested_identities.keyword_type`]: MemberIdentityType.USERNAME } },
165-
{ term: { [`nested_identities.bool_verified`]: false } },
166-
],
167-
},
168-
})
169-
}
170-
171-
// Query 3: Verified username -> Unverified email (email-as-username platforms)
172-
for (const { value } of verifiedUsernameEmailMatches) {
173-
identitiesShould.push({
174-
bool: {
175-
must: [
176-
{ term: { [`nested_identities.keyword_value`]: value } },
177-
{ term: { [`nested_identities.keyword_type`]: MemberIdentityType.EMAIL } },
178-
{ term: { [`nested_identities.bool_verified`]: false } },
179-
],
180-
},
181-
})
182-
}
183-
184-
// Query 4: Verified -> Verified fuzzy matches (chunked)
185-
if (verifiedFuzzyMatches.length > 0) {
186-
const uniqueFuzzyValues = [
187-
...new Set(verifiedFuzzyMatches.map(({ cleanedValue }) => cleanedValue)),
188-
]
189-
const fuzzyChunks = chunkArray(uniqueFuzzyValues, CHUNK_SIZE)
118+
const CHUNK_SIZE = 15 // Split queries into chunks to avoid OpenSearch limits
190119

191-
for (const chunk of fuzzyChunks) {
192-
const fuzzyShouldClauses = chunk.map((cleanedValue) => ({
120+
const clauseBuilders: OpenSearchQueryClauseBuilder<Partial<IMemberIdentity>>[] = [
121+
{
122+
// Query 1: Verified -> Unverified exact matches
123+
matches: verifiedExactMatches,
124+
builder: ({ value, platform }) => ({
125+
bool: {
126+
must: [
127+
{ term: { [`nested_identities.keyword_value`]: value } },
128+
{ match: { [`nested_identities.string_platform`]: platform } },
129+
{ term: { [`nested_identities.bool_verified`]: false } },
130+
],
131+
},
132+
}),
133+
},
134+
{
135+
// Query 2: Verified email -> Unverified username (email-as-username platforms)
136+
matches: verifiedEmailUsernameMatches,
137+
builder: ({ value }) => ({
138+
bool: {
139+
must: [
140+
{ term: { [`nested_identities.keyword_value`]: value } },
141+
{ terms: { [`nested_identities.string_platform`]: EMAIL_AS_USERNAME_PLATFORMS } },
142+
{ term: { [`nested_identities.keyword_type`]: MemberIdentityType.USERNAME } },
143+
{ term: { [`nested_identities.bool_verified`]: false } },
144+
],
145+
},
146+
}),
147+
},
148+
{
149+
// Query 3: Verified username -> Unverified email (email-as-username platforms)
150+
matches: verifiedUsernameEmailMatches,
151+
builder: ({ value }) => ({
152+
bool: {
153+
must: [
154+
{ term: { [`nested_identities.keyword_value`]: value } },
155+
{ term: { [`nested_identities.keyword_type`]: MemberIdentityType.EMAIL } },
156+
{ term: { [`nested_identities.bool_verified`]: false } },
157+
],
158+
},
159+
}),
160+
},
161+
{
162+
// Query 5: Unverified -> Verified exact matches
163+
matches: unverifiedExactMatches,
164+
builder: ({ value, platform }) => ({
165+
bool: {
166+
must: [
167+
{ term: { [`nested_identities.keyword_value`]: value } },
168+
{ match: { [`nested_identities.string_platform`]: platform } },
169+
{ term: { [`nested_identities.bool_verified`]: true } },
170+
],
171+
},
172+
}),
173+
},
174+
{
175+
// Query 6: Unverified email -> Verified username (email-as-username platforms)
176+
matches: unverifiedEmailUsernameMatches,
177+
builder: ({ value }) => ({
178+
bool: {
179+
must: [
180+
{ term: { [`nested_identities.keyword_value`]: value } },
181+
{ terms: { [`nested_identities.string_platform`]: EMAIL_AS_USERNAME_PLATFORMS } },
182+
{ term: { [`nested_identities.keyword_type`]: MemberIdentityType.USERNAME } },
183+
{ term: { [`nested_identities.bool_verified`]: true } },
184+
],
185+
},
186+
}),
187+
},
188+
{
189+
// Query 7: Unverified username -> Verified email (email-as-username platforms)
190+
matches: unverifiedUsernameEmailMatches,
191+
builder: ({ value }) => ({
192+
bool: {
193+
must: [
194+
{ term: { [`nested_identities.keyword_value`]: value } },
195+
{ term: { [`nested_identities.keyword_type`]: MemberIdentityType.EMAIL } },
196+
{ term: { [`nested_identities.bool_verified`]: true } },
197+
],
198+
},
199+
}),
200+
},
201+
{
202+
// Query 4: Verified -> Verified fuzzy matches
203+
matches: uniqBy(verifiedFuzzyMatches, 'value'),
204+
builder: ({ value }) => ({
193205
match: {
194206
[`nested_identities.string_value`]: {
195-
query: cleanedValue,
207+
query: value,
196208
prefix_length: 1,
197209
fuzziness: 'auto',
198210
},
199211
},
200-
}))
201-
202-
identitiesShould.push({
203-
bool: {
204-
should: fuzzyShouldClauses,
205-
minimum_should_match: 1,
206-
},
207-
})
212+
}),
213+
filter: [{ term: { [`nested_identities.bool_verified`]: true } }],
214+
},
215+
]
216+
217+
for (const clauseBuilder of clauseBuilders) {
218+
const { matches, builder, filter } = clauseBuilder
219+
if (matches.length > 0) {
220+
const chunks = chunkArray(matches, CHUNK_SIZE)
221+
for (const chunk of chunks) {
222+
const shouldClauses = chunk.map(builder)
223+
const chunkQuery: any = {
224+
bool: {
225+
should: shouldClauses,
226+
minimum_should_match: 1,
227+
},
228+
}
229+
if (filter) {
230+
chunkQuery.bool.filter = filter
231+
}
232+
identitiesShould.push(chunkQuery)
233+
}
208234
}
209235
}
210236

211-
// Query 5: Unverified -> Verified exact matches
212-
for (const { value, platform } of unverifiedExactMatches) {
213-
identitiesShould.push({
214-
bool: {
215-
must: [
216-
{ term: { [`nested_identities.keyword_value`]: value } },
217-
{ match: { [`nested_identities.string_platform`]: platform } },
218-
{ term: { [`nested_identities.bool_verified`]: true } },
219-
],
220-
},
221-
})
222-
}
223-
224-
// Query 6: Unverified email -> Verified username (email-as-username platforms)
225-
for (const { value } of unverifiedEmailUsernameMatches) {
226-
identitiesShould.push({
227-
bool: {
228-
must: [
229-
{ term: { [`nested_identities.keyword_value`]: value } },
230-
{ terms: { [`nested_identities.string_platform`]: EMAIL_AS_USERNAME_PLATFORMS } },
231-
{ term: { [`nested_identities.keyword_type`]: MemberIdentityType.USERNAME } },
232-
{ term: { [`nested_identities.bool_verified`]: true } },
233-
],
234-
},
235-
})
236-
}
237-
238-
// Query 7: Unverified username -> Verified email (email-as-username platforms)
239-
for (const { value } of unverifiedUsernameEmailMatches) {
240-
identitiesShould.push({
241-
bool: {
242-
must: [
243-
{ term: { [`nested_identities.keyword_value`]: value } },
244-
{ term: { [`nested_identities.keyword_type`]: MemberIdentityType.EMAIL } },
245-
{ term: { [`nested_identities.bool_verified`]: true } },
246-
],
247-
},
248-
})
249-
}
250-
251237
// Wrap all identity queries in a nested query (identities are nested documents)
252238
const nestedIdentityQuery = {
253239
nested: {

0 commit comments

Comments
 (0)