@@ -38,7 +38,7 @@ def read_metadata(text):
3838 for part in parts :
3939 parts = part .split (":" )
4040 if len (parts ) == 2 :
41- if parts [0 ] in ['title' , 'description' , 'slug' , 'keywords' , 'score' ]:
41+ if parts [0 ] in ['title' , 'description' , 'slug' , 'keywords' , 'score' , 'doc_type' ]:
4242 metadata [parts [0 ]] = int (parts [1 ].strip ()) if parts [0 ] == 'score' else parts [1 ].strip ()
4343 return metadata
4444
@@ -215,12 +215,12 @@ def update_page_links(directory, base_directory, page_path, url, content, base_u
215215 c_page = os .path .abspath (os .path .join (os .path .dirname (page_path ), './' + target ))
216216 metadata , _ = parse_metadata_and_content (directory , base_directory , c_page , log_snippet_failure = False )
217217 if 'slug' in metadata :
218- link_data .append ((url , f' { base_url } { metadata .get ('slug' )} ' ))
218+ link_data .append ((url , f" { base_url } { metadata .get ('slug' )} " ))
219219 else :
220220 fail = True
221221 elif target .startswith ('/' ): # ignore external links
222222 target = target .removesuffix ('/' )
223- link_data .append ((url , f' { base_url } { target } ' ))
223+ link_data .append ((url , f" { base_url } { target } " ))
224224 if fail :
225225 print (f"Warning: couldn't resolve link for { page_path } " )
226226
@@ -248,7 +248,8 @@ def parse_markdown_content(metadata, content, base_url):
248248 'lvl0' : current_h1 ,
249249 'lvl1' : current_h1
250250 },
251- 'score' : metadata .get ('score' , 0 )
251+ 'score' : metadata .get ('score' , 0 ),
252+ 'doc_type' : metadata .get ('doc_type' , '' )
252253 }
253254 for line in lines :
254255 if line .startswith ('# ' ):
@@ -266,8 +267,7 @@ def parse_markdown_content(metadata, content, base_url):
266267 current_subdoc ['type' ] = 'lvl1'
267268 current_subdoc ['object_id' ] = custom_slugify (heading_slug )
268269 current_subdoc ['hierarchy' ]['lvl1' ] = current_h1
269- current_subdoc ['hierarchy' ]['lvl0' ] = current_h1 if metadata .get ('title' , '' ) == '' else metadata .get (
270- 'title' , '' )
270+ current_subdoc ['hierarchy' ]['lvl0' ] = current_h1 if metadata .get ('title' , '' ) == '' else metadata .get ('title' , '' )
271271 elif line .startswith ('## ' ):
272272 if current_subdoc :
273273 yield from split_large_document (current_subdoc )
@@ -293,7 +293,8 @@ def parse_markdown_content(metadata, content, base_url):
293293 'lvl0' : current_h1 if metadata .get ('title' , '' ) == '' else metadata .get ('title' , '' ),
294294 'lvl1' : current_h1 ,
295295 'lvl2' : current_h2 ,
296- }
296+ },
297+ 'doc_type' : metadata .get ('doc_type' , '' )
297298 }
298299 elif line .startswith ('### ' ):
299300 # note we send users to the h2 or h1 even on ###
@@ -322,7 +323,8 @@ def parse_markdown_content(metadata, content, base_url):
322323 'lvl1' : current_h1 ,
323324 'lvl2' : current_h2 ,
324325 'lvl3' : current_h3 ,
325- }
326+ },
327+ 'doc_type' : metadata .get ('doc_type' , '' )
326328 }
327329 elif line .startswith ('#### ' ):
328330 if current_subdoc :
@@ -348,7 +350,8 @@ def parse_markdown_content(metadata, content, base_url):
348350 'lvl2' : current_h2 ,
349351 'lvl3' : current_h3 ,
350352 'lvl4' : current_h4 ,
351- }
353+ },
354+ 'doc_type' : metadata .get ('doc_type' , '' )
352355 }
353356 elif current_subdoc :
354357 current_subdoc ['content' ] += line + '\n '
@@ -410,9 +413,9 @@ def compute_page_rank(link_data, damping_factor=0.85, max_iter=100, tol=1e-6):
410413def create_new_index (client , index_name ):
411414 try :
412415 client .delete_index (index_name )
413- print (f' Temporary index \ '{ index_name } \ ' deleted successfully.' )
416+ print (f" Temporary index '{ index_name } ' deleted successfully." )
414417 except :
415- print (f' Temporary index \ '{ index_name } \ ' does not exist or could not be deleted' )
418+ print (f" Temporary index '{ index_name } ' does not exist or could not be deleted" )
416419 client .set_settings (index_name , settings ['settings' ])
417420 client .save_rules (index_name , settings ['rules' ])
418421 print (f"Settings applied to temporary index '{ index_name } '." )
@@ -442,9 +445,19 @@ def main(base_directory, algolia_app_id, algolia_api_key, algolia_index_name,
442445 else :
443446 for d in batch :
444447 print (f"{ d ['url' ]} - { d ['page_rank' ]} " )
445- print (f'{ 'processed' if dry_run else 'indexed' } { len (batch )} records' )
448+ # Print a sample record to verify doc_type is included
449+ if batch :
450+ print ("\n --- Sample record ---" )
451+ sample_record = batch [0 ]
452+ print (f"Title: { sample_record .get ('title' , 'N/A' )} " )
453+ print (f"URL: { sample_record .get ('url' , 'N/A' )} " )
454+ print (f"Type: { sample_record .get ('type' , 'N/A' )} " )
455+ print (f"Doc Type: { sample_record .get ('doc_type' , 'N/A' )} " )
456+ print (f"Keywords: { sample_record .get ('keywords' , 'N/A' )} " )
457+ print ("--- End sample ---\n " )
458+ print (f"{ 'processed' if dry_run else 'indexed' } { len (batch )} records" )
446459 t += len (batch )
447- print (f' total { 'processed' if dry_run else 'indexed' } { t } records' )
460+ print (f" total { 'processed' if dry_run else 'indexed' } { t } records" )
448461 if not dry_run :
449462 print ('switching temporary index...' , end = '' )
450463 client .operation_index (temp_index_name , {"operation" : "move" , "destination" : algolia_index_name })
@@ -471,4 +484,4 @@ def main(base_directory, algolia_app_id, algolia_api_key, algolia_index_name,
471484 if args .dry_run :
472485 print ('Dry running, not sending results to Algolia.' )
473486 main (args .base_directory , args .algolia_app_id , args .algolia_api_key , args .algolia_index_name ,
474- dry_run = args .dry_run )
487+ dry_run = args .dry_run )
0 commit comments