Currently I’m trying to use segmentation to insure words are detected even when they are included in sentences with out spaces like “ilikecats” but currently it doesn’t detect words unless they are said by themselves
def normalize_text(text):
# Normalize Unicode characters (e.g., accent removal)
normalized = unicodedata.normalize('NFD', text)
ascii_text = ''.join(char for char in normalized if unicodedata.category(char) != 'Mn')
# Generate all possible substitutions based on the mapping
possible_texts = [ascii_text] # Start with the original text as one possibility
for symbol, replacements in lookalike_mappings.items():
new_texts = []
for t in possible_texts:
if symbol in t: # Only apply replacements if symbol is in the text
for replacement in replacements:
new_texts.append(t.replace(symbol, replacement))
else:
new_texts.append(t) # Keep the original text as it is
possible_texts = new_texts # Update possible_texts with new variations
return possible_texts
def segment_words(text, word_dict):
"""Segment a string of concatenated words into individual words using a dictionary."""
# Normalize the text first
normalized_texts = normalize_text(text)
segments = []
for normalized_text in normalized_texts:
n = len(normalized_text)
memo = [-1] * (n + 1) # To memoize the positions of valid words
# Dynamic programming to break the text into valid words
def find_segments(start):
# If we've reached the end of the text, return True (end of recursion)
if start == n:
return True
# If already computed, use the memoized result
if memo[start] != -1:
return memo[start] == 1
for end in range(start + 1, n + 1):
# Check if the substring normalized_text[start:end] is a valid word in the Trie
if triedict.search(normalized_text[start:end]):
# If valid, proceed to the next segment
if find_segments(end):
segments.append(normalized_text[start:end]) # Add the valid word
memo[start] = 1 # Mark as valid
return True
memo[start] = 0 # Mark as invalid position
return False
# Start the segmentation from the beginning of the normalized text
find_segments(0)
return segments
here is the code I’m attempting to use for normalization and segmentation of words