Skip to content

Commit 53ae81c

Browse files
committed
Arabic Presentation Form Normalizer
1 parent 7f58483 commit 53ae81c

File tree

11 files changed

+471
-6
lines changed

11 files changed

+471
-6
lines changed

.pre-commit-hooks.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,10 @@
55
language: python
66
types: [text]
77
stages: [commit, push, manual]
8+
- id: arabic-presentation-form
9+
name: Arabic Presentation Form Normalizer
10+
description: Replaces Arabic Presentation Forms and other contextual forms with their default characters.
11+
entry: arabic-presentation-form
12+
language: python
13+
types: [text]
14+
stages: [commit, push, manual]

.vscode/launch.json

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,22 @@
66
"configurations": [
77
{
88
"name": "pyModule",
9-
"type": "python",
9+
"type": "debugpy",
1010
"request": "launch",
1111
"module": "pre_commit_hooks.check_header_footer",
1212
"console": "integratedTerminal",
1313
"justMyCode": true
1414
},
1515
{
1616
"name": "pytest",
17-
"type": "python",
17+
"type": "debugpy",
1818
"request": "launch",
1919
"module": "pytest",
2020
"console": "integratedTerminal",
2121
"args": [
2222
"--no-cov", // disable as it affects breakpoints
23-
"-vv", "-k",
23+
"-vv",
24+
"-k",
2425
"" // add test function name here
2526
],
2627
"justMyCode": true

.vscode/settings.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
"editor.wordBasedSuggestions": "off",
44
"editor.defaultFormatter": "charliermarsh.ruff",
55
"editor.codeActionsOnSave": {
6-
"source.fixAll": "explicit",
7-
"source.organizeImports": "explicit"
6+
"source.fixAll": "always",
7+
"source.organizeImports": "always"
88
}
99
}
1010
}

README.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,35 @@ repos:
9595
Future work:
9696
1. Support a year parameter that can be used to apply fixes.
9797
98+
99+
### arabic-presentation-form
100+
101+
Replaces characters from the Arabic Presentation Forms (A or B) with their 'default' Unicode characters.
102+
One application is when using a font, such as 'Scheherazade New', that does not support these characters.
103+
104+
Arguments:
105+
- `--excluded-chars`: Regex of characters to exclude from being fixed.
106+
- `--custom-rules`: Rules to update or override the tool's built-in configuration. Format and example below:
107+
```json
108+
"RuleName": {"rule": {"ReplacementCharacter(s)": "RegexOfApplicableCharacter(s)"}}
109+
"ʾalif": {"rule": {"\u0627": "(\ufe8d|\ufe8e)"}},
110+
```
111+
112+
Example where we are extending the applicable file types and using a specific folder (all subfolders under `site/data`)
113+
114+
```yaml
115+
repos:
116+
- repo: https://github.com/adehad/pre-commit-hooks
117+
rev: main
118+
hooks:
119+
- id: arabic-presentation-form
120+
entry: arabic-presentation-form
121+
language: python
122+
types_or: [text, json, markdown]
123+
args: [--excluded-chars, (ﷺ)]
124+
files: ^site/data/
125+
```
126+
98127
## Local Installation
99128

100129
```console

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ Source = "https://github.com/adehad/pre-commit-hooks/"
3030

3131
[project.scripts]
3232
check-header-footer = "pre_commit_hooks.check_header_footer:main"
33+
arabic-presentation-form = "pre_commit_hooks.arabic_presentation_form:main"
3334

3435
[tool.hatch.build]
3536
sources = ["src"]
@@ -72,7 +73,8 @@ python = ["38", "39", "310", "311"]
7273
# External Tool Config
7374
########################################################################################
7475
[tool.mypy]
75-
python_version = 3.8
76+
python_version = '3.8'
77+
strict = true
7678
ignore_missing_imports = true
7779
namespace_packages = true
7880
show_error_codes = true
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
"""Arabic Presentation Form."""
2+
3+
from __future__ import annotations
4+
5+
import functools
6+
import pathlib
7+
import re
8+
import sys
9+
from typing import Any, Dict, Sequence
10+
11+
from ..util import (
12+
ABCArgs,
13+
ABCHook,
14+
ExitCode,
15+
HashableDict,
16+
load_json_source,
17+
)
18+
from . import char_map
19+
20+
# For Windows: we want to be sure to use UTF-8. Only reconfigure when the
# active stream offers the method; a replaced stdout (e.g. io.StringIO) does
# not, and importing this module should not fail because of that.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# Maps a compiled regex to the replacement text for whatever it matches.
# The pattern type is quoted so the alias is not evaluated at import time:
# `re.Pattern[...]` is only subscriptable at runtime from Python 3.9 onwards.
RulesDict = Dict["re.Pattern[Any]", str]
22+
23+
24+
def apply_rules_to_lines(
    line: str,
    rules: RulesDict,
    exclude: re.Pattern,
    file_name: str,
    line_no: int,
) -> tuple[ExitCode, str]:
    """Apply the replacement rules to every non-excluded character of a line.

    A ``[Fixed]`` or ``[Not Fixed]`` message with the file, line and column is
    printed for each character that was replaced or that still needs
    attention.

    Args:
        line (str): Line to check the rules against.
        rules (RulesDict): Compiled regex -> replacement text.
        exclude (re.Pattern): Text matched by this pattern is left untouched.
        file_name (str): The name of the file being checked (for messages).
        line_no (int): The line number being checked (for messages).

    Returns:
        (ExitCode, str): (The PASS/FAIL state, The new line).
    """
    # Replace excluded text with spaces so it cannot trigger the quick check.
    masked_line = exclude.sub(" ", line)

    # The quick check inspects only the highest code point of the masked line.
    # `max` raises ValueError on an empty string, hence the explicit guard.
    if not masked_line or not char_map.is_contains_non_general_form(
        max(masked_line)
    ):
        return ExitCode.OK, line

    # Zero-based columns covered by an exclusion match; these characters are
    # copied through verbatim. Empty matches contribute no columns.
    excluded_cols: set[int] = set()
    for excluded_match in exclude.finditer(line):
        excluded_cols.update(range(excluded_match.start(), excluded_match.end()))

    # Wrap once instead of once per character; `apply_rule` is memoised and
    # needs a hashable rules argument.
    hashable_rules = HashableDict(rules)

    new_chars: list[str] = []
    for col_no, char in enumerate(line, start=1):
        if (col_no - 1) in excluded_cols:
            new_chars.append(char)
            continue

        new_c = apply_rule(rules=hashable_rules, character=char)
        if char != new_c:
            status = "[Fixed]"
        elif char_map.is_contains_non_general_form(new_c):
            status = "[Not Fixed]"
        else:
            status = ""

        if status:
            new_c_as_unicode_hex = [f"\\u{ord(ch):04x}" for ch in new_c]
            print(
                f"{status} {file_name}:{line_no}:{col_no} "
                f"[{new_c} ({new_c_as_unicode_hex})]"
            )

        new_chars.append(new_c)

    return ExitCode.FAIL, "".join(new_chars)
74+
75+
76+
def get_rules(custom_rules: dict[str, dict[str, str]]) -> RulesDict:
    """Build the compiled rule lookup from the built-in and custom rules.

    Custom entries are layered on top of ``char_map.CHAR_MAP``, so an entry
    with the same rule name replaces the built-in one.

    Args:
        custom_rules (dict): Additional or overriding rules, keyed by rule name.

    Returns:
        RulesDict: Compiled regex -> replacement text.

    """
    merged_rules: char_map.CHAR_MAP_TYPE = dict(char_map.CHAR_MAP)
    merged_rules.update(custom_rules)
    # Each rule maps replacement text -> regex source; invert it so that the
    # compiled regex becomes the key.
    return {
        re.compile(pattern_source): replacement
        for rule_entry in merged_rules.values()
        for replacement, pattern_source in rule_entry["rule"].items()
    }
94+
95+
96+
@functools.lru_cache
97+
def apply_rule(rules: RulesDict, character: str) -> str:
98+
"""Apply the rule from the list of rules to the character.
99+
100+
Args:
101+
rules (RulesDict): rules to apply for the character.
102+
character (str): The letter/character to check against.
103+
104+
Returns:
105+
str: The character after applying any rules.
106+
"""
107+
new_char = character
108+
for reg_pattern, replace_char in rules.items():
109+
if reg_pattern.match(character):
110+
new_char = reg_pattern.sub(replace_char, character)
111+
break
112+
return new_char
113+
114+
115+
class ArabicPresentationFormArgs(ABCArgs):
    """Parsed command line options read by the Arabic Presentation Form hook."""

    # Regex source supplied through ``--excluded-chars``.
    excluded_chars: str
    # Rule additions/overrides supplied through ``--custom-rules``.
    custom_rules: char_map.CHAR_MAP_TYPE
120+
121+
122+
class ArabicPresentationFormChecker(ABCHook):
    """Hook that rewrites Arabic Presentation Form characters in text files."""

    def setup_parser(self) -> None:
        """Register the hook-specific command line arguments."""
        self.parser.add_argument(
            "--excluded-chars",
            type=str,
            default="",
            metavar="exclude-char-regex",
            help="Regex for characters to exclude. e.g. (ﷺ)",
        )
        self.parser.add_argument(
            "--custom-rules",
            type=load_json_source,
            default={},
            metavar="Path-OR-JSON-String",
            help=(
                '"RuleName": {"rule": {"ReplacementCharacter(s)": "RegexOfApplicableCharacter(s)"}}'  # noqa: E501
                '. e.g. "ʾalif": {"rule": {"\u0627": "(\ufe8d|\ufe8e)"}},'  # noqa: RUF001
                + ". To exclude a unicode character, overwrite its default entry."
            ),
        )

    def implementation(
        self,
        file_name: pathlib.Path,
        args: ArabicPresentationFormArgs,
    ) -> ExitCode:
        """Apply the rules to one file and rewrite it when a line changed.

        Args:
            file_name (pathlib.Path): File to check; read and written as UTF-8.
            args (ArabicPresentationFormArgs): Parsed command line options.

        Returns:
            ExitCode: The per-line exit codes OR-ed together.
        """
        exit_code = int(ExitCode.OK)
        exclude_regex = re.compile(args.excluded_chars)
        # The compiled rule set does not depend on the line; build it once
        # per file rather than once per line.
        rules = get_rules(args.custom_rules)

        new_file_lines = []
        modified = False
        with file_name.open("r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                intermediate_exit_code, new_line = apply_rules_to_lines(
                    line=line,
                    line_no=line_no,
                    file_name=str(file_name),
                    rules=rules,
                    exclude=exclude_regex,
                )
                exit_code |= intermediate_exit_code
                modified = modified or new_line != line

                # Drop excluded text, then re-check what is left of the line;
                # the `or " "` keeps `max` from seeing an empty string.
                if char_map.is_contains_non_general_form(
                    max(exclude_regex.sub("", new_line) or " ")
                ):
                    print(f"Incomplete Fixes Applied: {file_name}:{line_no}")

                new_file_lines.append(new_line)

        # Leave files without any replacement untouched on disk.
        if modified:
            with file_name.open("w", encoding="utf-8") as f:
                f.writelines(new_file_lines)
        return ExitCode(exit_code)
177+
178+
179+
def main(argv: Sequence[str] | None = None) -> int:
    """Run the Arabic Presentation Form hook.

    Args:
        argv (Sequence[str] | None): Arguments forwarded to the checker's
            ``run`` method.

    Returns:
        int: Whatever the checker's ``run`` method returns.
    """
    return ArabicPresentationFormChecker().run(argv=argv)
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
"""Arabic Presentation Form Hook."""
2+
3+
from __future__ import annotations
4+
5+
from . import main
6+
7+
if __name__ == "__main__":
    # Hand the hook's return value to the interpreter as the exit status.
    exit_status = main()
    raise SystemExit(exit_status)

0 commit comments

Comments
 (0)