1+ """
2+ Script to update the custom dictionary 'main.txt' with new words from a given .po file.
3+
4+ The script scans a specified .po file, ignoring certain metadata lines (e.g., lines starting with "#:").
5+ It extracts all unique Greek and English words, compares them against the custom dictionary
6+ under the 'dictionaries/' directory (sibling to the 'scripts/' directory), and adds any new words in alphabetical order.
7+ """
8+
9+ import sys
10+ import os
11+ import re
12+
# Characters that may form a word: ASCII letters plus the modern Greek letters,
# listed by code point so that every accented/dialytika form is covered
# (including U+0390, U+03AA and U+03AB) while the punctuation mark U+0387 and
# the unassigned code points of the Greek block are left out.
_WORD_PATTERN = re.compile(
    r"\b[a-zA-Z\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE]+\b"
)

# A backslash followed by any single character, i.e. one .po escape sequence.
_PO_ESCAPE_PATTERN = re.compile(r"\\(.)")


def _quoted_content(line):
    """Return the text between the first and the last double quote of a line.

    Returns an empty string when the line holds no quoted string.
    """
    start = line.find('"')
    end = line.rfind('"')
    if start == -1 or end <= start:
        return ""
    return line[start + 1:end]


def _unescape_po(text):
    """Resolve backslash escapes so they cannot glue onto neighbouring words.

    An escaped quote or backslash becomes the literal character; every other
    escape (``\\n``, ``\\t``, ...) becomes a space and so acts as a separator.
    """
    def _replace(match):
        char = match.group(1)
        return char if char in '"\\' else " "

    return _PO_ESCAPE_PATTERN.sub(_replace, text)


def _extract_msgstr_words(lines):
    """Return the set of lowercased words found in the msgstr entries of lines."""
    words = set()
    fragments = []
    collecting = False

    def _flush():
        # Adjacent strings of one entry are concatenated without a separator,
        # as the PO format defines.
        if fragments:
            text = _unescape_po("".join(fragments))
            words.update(word.lower() for word in _WORD_PATTERN.findall(text))
            fragments.clear()

    for line in lines:
        if line.startswith("#:"):
            continue  # Source-reference metadata never contributes words.

        if line.startswith("msgstr"):
            # A new msgstr (including msgstr[N] plural forms) closes the
            # previous one so that separate translations are never merged.
            _flush()
            collecting = True
            fragments.append(_quoted_content(line))
        elif collecting:
            if line.startswith('"'):
                fragments.append(_quoted_content(line))
            else:
                collecting = False
                _flush()

    # The file may end while a msgstr block is still open.
    _flush()
    return words


def scan_and_update(file_path):
    """
    Scan the given .po file, extract words, and update the main dictionary.

    Only msgstr entries (the translations) are scanned; lines starting with
    "#:" are ignored. Words are lowercased before being compared with, and
    added to, 'main.txt' in the 'dictionaries/' directory that sits next to
    this script's directory. The directory is created when missing, and the
    dictionary file is created the first time there are words to write.

    Args:
        file_path (str): Path to the .po file.

    Returns:
        int: The number of new words added to the dictionary (0 when the
            input file does not exist or contains no new words).
    """
    # Locate main.txt relative to this script: <script dir>/../dictionaries/
    script_dir = os.path.dirname(os.path.abspath(__file__))
    dictionaries_dir = os.path.abspath(os.path.join(script_dir, "..", "dictionaries"))
    dictionary_path = os.path.join(dictionaries_dir, "main.txt")

    os.makedirs(dictionaries_dir, exist_ok=True)

    # Load the existing dictionary, one lowercased word per non-blank line.
    try:
        with open(dictionary_path, "r", encoding="utf-8") as dict_file:
            dictionary = {line.strip().lower() for line in dict_file if line.strip()}
    except FileNotFoundError:
        print(f"Dictionary file not found at {dictionary_path} . Creating a new one.")
        dictionary = set()

    try:
        with open(file_path, "r", encoding="utf-8") as input_file:
            lines = input_file.readlines()
    except FileNotFoundError:
        print(f"Input file {file_path} not found.")
        return 0

    new_words = _extract_msgstr_words(lines) - dictionary
    if not new_words:
        print("No new words to add to the dictionary.")
        return 0

    dictionary |= new_words
    # One word per line with no stray leading whitespace, newline-terminated.
    with open(dictionary_path, "w", encoding="utf-8") as dict_file:
        dict_file.write("\n".join(sorted(dictionary)) + "\n")
    print(f"Added {len(new_words)} new word{'s' if len(new_words) != 1 else ''} to the dictionary.")

    return len(new_words)
106+
if __name__ == "__main__":
    # The script takes exactly one argument: the path of the .po file to scan.
    if len(sys.argv) != 2:
        print("Usage: python add_to_dictionary.py <file_path>")
        # A usage error is a failure; report it through the exit status just
        # like the invalid-path case below.
        sys.exit(1)

    file_path = sys.argv[1]
    # Reject directories and missing paths before touching the dictionary.
    if not os.path.isfile(file_path):
        print(f"The provided path '{file_path} ' is not a valid file.")
        sys.exit(1)

    # Process the input file and update the dictionary.
    scan_and_update(file_path)
0 commit comments