How to build a multilingual OCR AI agent with EasyOCR and OpenCV in Python
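The agent below is written for Google Colab (its upload helper uses google.colab.files) and relies on EasyOCR, OpenCV, NumPy, and Matplotlib. A minimal setup sketch, assuming a Colab notebook where only easyocr needs installing; adjust the install line for other environments:

# Install EasyOCR (OpenCV, NumPy, and Matplotlib ship with Colab by default)
# !pip install easyocr

import os
import re
import json
from typing import List, Dict, Optional

import cv2
import numpy as np
import matplotlib.pyplot as plt
import easyocr
from google.colab import files  # Colab-only file-upload helper
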
class AdvancedOCRAgent:
    """
    Advanced OCR AI Agent with preprocessing, multi-language support,
    and intelligent text extraction capabilities.
    """

    def __init__(self, languages: List[str] = ['en'], gpu: bool = True):
        """Initialize OCR agent with specified languages."""
        print("🤖 Initializing Advanced OCR Agent...")
        self.languages = languages
        self.reader = easyocr.Reader(languages, gpu=gpu)
        self.confidence_threshold = 0.5
        print(f"✅ OCR Agent ready! Languages: {languages}")
    def upload_image(self) -> Optional[str]:
        """Upload image file through Colab interface."""
        print("📁 Upload your image file:")
        uploaded = files.upload()
        if uploaded:
            filename = list(uploaded.keys())[0]
            print(f"✅ Uploaded: {filename}")
            return filename
        return None
    def preprocess_image(self, image: np.ndarray, enhance: bool = True) -> np.ndarray:
        """Advanced image preprocessing for better OCR accuracy."""
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image.copy()
        if enhance:
            # Boost local contrast with CLAHE, then denoise and sharpen
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            gray = clahe.apply(gray)
            gray = cv2.fastNlMeansDenoising(gray)
            kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
            gray = cv2.filter2D(gray, -1, kernel)
        # Adaptive thresholding copes with uneven lighting better than a global threshold
        binary = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )
        return binary
    def extract_text(self, image_path: str, preprocess: bool = True) -> Dict:
        """Extract text from image with advanced processing."""
        print(f"🔍 Processing image: {image_path}")
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Could not load image: {image_path}")
        if preprocess:
            processed_image = self.preprocess_image(image)
        else:
            processed_image = image
        results = self.reader.readtext(processed_image)
        extracted_data = {
            'raw_results': results,
            'filtered_results': [],
            'full_text': '',
            'confidence_stats': {},
            'word_count': 0,
            'line_count': 0
        }
        high_confidence_text = []
        confidences = []
        for (bbox, text, confidence) in results:
            if confidence >= self.confidence_threshold:
                extracted_data['filtered_results'].append({
                    'text': text,
                    'confidence': confidence,
                    'bbox': bbox
                })
                high_confidence_text.append(text)
                confidences.append(confidence)
        extracted_data['full_text'] = ' '.join(high_confidence_text)
        extracted_data['word_count'] = len(extracted_data['full_text'].split())
        extracted_data['line_count'] = len(high_confidence_text)
        if confidences:
            extracted_data['confidence_stats'] = {
                'mean': np.mean(confidences),
                'min': np.min(confidences),
                'max': np.max(confidences),
                'std': np.std(confidences)
            }
        return extracted_data
    def visualize_results(self, image_path: str, results: Dict, show_bbox: bool = True):
        """Visualize OCR results with bounding boxes."""
        image = cv2.imread(image_path)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        plt.figure(figsize=(15, 10))
        if show_bbox:
            plt.subplot(2, 2, 1)
            img_with_boxes = image_rgb.copy()
            for item in results['filtered_results']:
                # OpenCV drawing functions expect int32 point coordinates
                bbox = np.array(item['bbox']).astype(np.int32)
                cv2.polylines(img_with_boxes, [bbox], True, (255, 0, 0), 2)
                x, y = bbox[0]
                cv2.putText(img_with_boxes, f"{item['confidence']:.2f}",
                            (int(x), int(y) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
            plt.imshow(img_with_boxes)
            plt.title("OCR Results with Bounding Boxes")
            plt.axis('off')
        plt.subplot(2, 2, 2)
        processed = self.preprocess_image(image)
        plt.imshow(processed, cmap='gray')
        plt.title("Preprocessed Image")
        plt.axis('off')
        plt.subplot(2, 2, 3)
        confidences = [item['confidence'] for item in results['filtered_results']]
        if confidences:
            plt.hist(confidences, bins=20, alpha=0.7, color="blue")
            plt.xlabel('Confidence Score')
            plt.ylabel('Frequency')
            plt.title('Confidence Score Distribution')
            plt.axvline(self.confidence_threshold, color="red", linestyle="--",
                        label=f'Threshold: {self.confidence_threshold}')
            plt.legend()
        plt.subplot(2, 2, 4)
        stats = results['confidence_stats']
        if stats:
            labels = ['Mean', 'Min', 'Max']
            values = [stats['mean'], stats['min'], stats['max']]
            plt.bar(labels, values, color=['green', 'red', 'blue'])
            plt.ylabel('Confidence Score')
            plt.title('Confidence Statistics')
            plt.ylim(0, 1)
        plt.tight_layout()
        plt.show()
    def smart_text_analysis(self, text: str) -> Dict:
        """Perform intelligent analysis of extracted text."""
        analysis = {
            'language_detection': 'unknown',
            'text_type': 'unknown',
            'key_info': {},
            'patterns': []
        }
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        phone_pattern = r'(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
        url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        date_pattern = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'
        patterns = {
            'emails': re.findall(email_pattern, text, re.IGNORECASE),
            'phones': re.findall(phone_pattern, text),
            'urls': re.findall(url_pattern, text, re.IGNORECASE),
            'dates': re.findall(date_pattern, text)
        }
        analysis['patterns'] = {k: v for k, v in patterns.items() if v}
        if any(patterns.values()):
            if patterns.get('emails') or patterns.get('phones'):
                analysis['text_type'] = 'contact_info'
            elif patterns.get('urls'):
                analysis['text_type'] = 'web_content'
            elif patterns.get('dates'):
                analysis['text_type'] = 'document_with_dates'
        if re.search(r'[а-яё]', text.lower()):
            analysis['language_detection'] = 'russian'
        elif re.search(r'[àáâãäåæçèéêëìíîïñòóôõöøùúûüý]', text.lower()):
            analysis['language_detection'] = 'romance_language'
        elif re.search(r'[一-龯]', text):
            analysis['language_detection'] = 'chinese'
        elif re.search(r'[ぁ-んァ-ン]', text):
            analysis['language_detection'] = 'japanese'
        elif re.search(r'[a-zA-Z]', text):
            analysis['language_detection'] = 'latin_based'
        return analysis
    def process_batch(self, image_folder: str) -> List[Dict]:
        """Process multiple images in batch."""
        results = []
        supported_formats = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
        for filename in os.listdir(image_folder):
            if filename.lower().endswith(supported_formats):
                image_path = os.path.join(image_folder, filename)
                try:
                    result = self.extract_text(image_path)
                    result['filename'] = filename
                    results.append(result)
                    print(f"✅ Processed: {filename}")
                except Exception as e:
                    print(f"❌ Error processing {filename}: {str(e)}")
        return results
    def export_results(self, results: Dict, format: str = 'json') -> str:
        """Export results in specified format."""
        if format.lower() == 'json':
            # EasyOCR returns numpy values (bboxes, confidences) that json.dumps
            # cannot serialize directly, so convert them via a default handler
            output = json.dumps(
                results, indent=2, ensure_ascii=False,
                default=lambda o: o.tolist() if isinstance(o, (np.ndarray, np.generic)) else str(o)
            )
            filename = 'ocr_results.json'
        elif format.lower() == 'txt':
            output = results['full_text']
            filename = 'extracted_text.txt'
        else:
            raise ValueError("Supported formats: 'json', 'txt'")
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f"📄 Results exported to: {filename}")
        return filename
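
With the class in place, a typical Colab session chains the methods together. This is a minimal usage sketch, assuming the Colab upload flow above; the variable names are illustrative, not part of the class:

# Illustrative end-to-end run: upload an image, extract text, inspect, and export
agent = AdvancedOCRAgent(languages=['en'], gpu=True)

image_path = agent.upload_image()  # opens the Colab file-upload dialog
if image_path:
    results = agent.extract_text(image_path, preprocess=True)
    print(results['full_text'])
    print(results['confidence_stats'])

    agent.visualize_results(image_path, results)

    analysis = agent.smart_text_analysis(results['full_text'])
    print(analysis['text_type'], analysis['patterns'])

    agent.export_results(results, format='json')

Lowering agent.confidence_threshold keeps more of EasyOCR's low-confidence detections in full_text, while raising it trades recall for cleaner output; the default of 0.5 set in __init__ is a middle ground.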