Skip to content

Commit

Permalink
perf: update regex pattern for Vietnamese
Browse files (browse the repository at this point in the history)
  • Loading branch information
datvodinh committed May 25, 2024
1 parent 4e9755d commit 1f6bc2c
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion rag_chatbot/core/ingestion/ingestion.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -19,7 +19,7 @@ def __init__(self, setting: RAGSettings | None = None) -> None:

def _filter_text(self, text):
# Define the regex pattern.
- pattern = r'[a-zA-Z0-9 `~!@#$%^&*()_\-+=\[\]{}|\\;:\'",.<>/?]+'
+ pattern = r'[a-zA-Z0-9 \u00C0-\u01B0\u1EA0-\u1EF9`~!@#$%^&*()_\-+=\[\]{}|\\;:\'",.<>/?]+'
matches = re.findall(pattern, text)
# Join all matched substrings into a single string
filtered_text = ' '.join(matches)
Expand Down

0 comments on commit 1f6bc2c

Please sign in to comment.