/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/**
 *
 */
package org.openimaj.image.text.extraction;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.openimaj.citation.annotation.Reference;
import org.openimaj.citation.annotation.ReferenceType;
import org.openimaj.image.DisplayUtilities;
import org.openimaj.image.FImage;
import org.openimaj.image.MBFImage;
import org.openimaj.image.connectedcomponent.ConnectedComponentLabeler;
import org.openimaj.image.pixel.ConnectedComponent;
import org.openimaj.image.pixel.ConnectedComponent.ConnectMode;
import org.openimaj.image.pixel.Pixel;
import org.openimaj.image.pixel.PixelSet;
import org.openimaj.image.processing.convolution.CompassOperators.Compass0;
import org.openimaj.image.processing.convolution.CompassOperators.Compass135;
import org.openimaj.image.processing.convolution.CompassOperators.Compass45;
import org.openimaj.image.processing.convolution.CompassOperators.Compass90;
import org.openimaj.image.processing.convolution.FConvolution;
import org.openimaj.image.processing.morphology.Close;
import org.openimaj.image.processing.morphology.Dilate;
import org.openimaj.image.processing.morphology.StructuringElement;
import org.openimaj.image.processing.morphology.Thin;
import org.openimaj.image.processing.threshold.OtsuThreshold;
import org.openimaj.image.processing.transform.SkewCorrector;
import org.openimaj.image.processor.connectedcomponent.ConnectedComponentProcessor;
import org.openimaj.image.processor.connectedcomponent.render.OrientatedBoundingBoxRenderer;
import org.openimaj.image.text.ocr.OCRProcessor;
import org.openimaj.math.geometry.point.Point2d;
import org.openimaj.math.geometry.point.Point2dImpl;
import org.openimaj.math.geometry.shape.Polygon;
import org.openimaj.math.geometry.shape.Rectangle;
import org.openimaj.util.pair.IndependentPair;

/**
 * A processor that attempts to extract text from an image. It uses a 3-stage
 * process: 1) find possible text regions; 2) filter and then extract those
 * regions; 3) OCR.
 * <p>
 * In the first stage it builds a feature map, which is an image where the
 * pixel intensity is the likelihood of a pixel being within a text region.
 * It does this by a series of convolutions and morphological operations that
 * find regions that have short edges in multiple directions.
 * <p>
 * In the second stage, the regions are turned into blobs and those blobs
 * that are too small or inappropriately shaped are removed. The remaining
 * regions are then extracted from the original image as subimages containing
 * text. The extracted subimages can have an expansion multiplier applied to
 * the box to ensure that enough surrounding information is contained within
 * the extracted subimage for the OCR to work. Use
 * {@link #setBoundingBoxPaddingPc(float)} with a multiplier to expand the
 * bounding boxes; e.g. 1.05 will expand the bounding box by 5%.
 * <p>
 * The third stage simply uses an {@link OCRProcessor} to process the subimages
 * and extract textual strings. Use {@link #setOCRProcessor(OCRProcessor)} to
 * set the {@link OCRProcessor} to use to extract text. Note that by default no
 * processor is set. If the processor is executed without an {@link OCRProcessor}
 * being set, the OCR stage will not occur. This part of the implementation has
 * been moved into the {@link TextExtractor} superclass.
 * <p>
 * The output of the processor can be retrieved using {@link #getTextRegions()},
 * which returns a map where the key is the bounding box of a detected text
 * region and the value is the extracted subimage.
 * <p>
 * From: [paper 01626635.pdf]
 * Xiaoqing Liu and Jagath Samarabandu;
 * An Edge-based Text Region Extraction Algorithm for Indoor Mobile Robot
 * Navigation, Proceedings of the IEEE International Conference on
 * Mechatronics & Automation, Niagara Falls, Canada, July 2005
 *
 * @see "http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1626635"
 * @author David Dupplaw (dpd@ecs.soton.ac.uk)
 * @created 29 Jul 2011
 *
 */
@Reference(
		type = ReferenceType.Inproceedings,
		author = { "Xiaoqing Liu", "Samarabandu, J." },
		title = "An edge-based text region extraction algorithm for indoor mobile robot navigation",
		year = "2005",
		booktitle = "Mechatronics and Automation, 2005 IEEE International Conference",
		pages = { "701", "706 Vol. 2" },
		month = "July-1 Aug.",
		number = "",
		volume = "2",
		customData = { "keywords", "edge-based text region extraction; feature extraction; scene text; text localization; vision-based mobile robot navigation; character recognition; edge detection; feature extraction; mobile robots; navigation; path planning; robot vision;", "doi", "10.1109/ICMA.2005.1626635", "ISSN", "" }
	)
public class LiuSamarabanduTextExtractorBasic extends TextExtractor<FImage>
{
	/** Whether to debug the text extractor - displaying images as it goes */
	public static final boolean DEBUG = false;

	/** Multiplier used to expand the bounding box of a text region; e.g. 1.1 adds 10% */
	private float boundingBoxPaddingPc = 1.1f;

	/** The output of the processor. Use {@link #getTextRegions()} */
	private Map<Rectangle, FImage> textRegions = null;

	/**
	 * Helper method that convolves the given image with the given
	 * convolution operator. The method also performs an abs() and
	 * a normalise(). We can take the absolute value as we're only
	 * interested in whether edges are, for example, vertical, and
	 * not in which direction they point.
	 *
	 * @param img The image to convolve
	 * @param c The convolution operator
	 * @return A convolved image.
	 */
	private FImage processImage( FImage img, FConvolution c )
	{
		return img.process( c )
				.abs()
				.normalise();
	}

	/**
	 * {@inheritDoc}
	 * @see org.openimaj.image.processor.ImageProcessor#processImage(org.openimaj.image.Image)
	 */
	@Override
	public void processImage( FImage image )
	{
		// Find which regions might be text
		FImage fmap = textRegionDetection( image );

		// Process the feature map
		processFeatureMap( fmap, image );

		// The output of the processor is the feature map
		image.internalAssign( fmap );
	}

	/**
	 * Process a feature map. This method updates the field
	 * <code>textRegions</code> as a side effect. Use {@link #getTextRegions()}
	 * to retrieve the text regions extracted by this method.
	 *
	 * @param fmap The feature map to process
	 * @param image The original image.
	 */
	public void processFeatureMap( FImage fmap, FImage image )
	{
		// Extract the text regions from the image
		Map<Rectangle, FImage> t = textRegionLocalisation( fmap, image );
		this.textRegions = t;
	}

	/**
	 * Calculate the feature map that gives the approximate localisation
	 * of candidate text regions.
	 *
	 * @param image The image to process.
	 * @return The feature map
	 */
	public FImage textRegionDetection( FImage image )
	{
		// 1) Directional Filtering
		HashMap<Integer, FImage> e = new HashMap<Integer, FImage>();
		e.put( 0, processImage( image, new Compass0() ) );
		e.put( 45, processImage( image, new Compass45() ) );
		e.put( 90, processImage( image, new Compass90() ) );
		e.put( 135, processImage( image, new Compass135() ) );

		// 2) Edge Selection
		// 2a) Get strong edges
		FImage e90strong = e.get( 90 ).process( new OtsuThreshold() );

		if( DEBUG )
			DisplayUtilities.display( e90strong, "Strong Edges" );

		// 2b) Get weak edges
		// 2b)i) Dilate:
		// Use a horizontal 1x3 structuring element
		StructuringElement se = new StructuringElement();
		se.positive.add( new Pixel( 0, 0 ) );
		se.positive.add( new Pixel( -1, 0 ) );
		se.positive.add( new Pixel( 1, 0 ) );

		if( DEBUG )
			System.out.println( "Dilating with a 1x3 structuring element" );

		FImage dilated = e90strong.process( new Dilate( se ) );

		// 2b)ii) Close:
		// Use a vertical mx1 structuring element
		int m = (int)(dilated.getHeight() / 25d);

		if( DEBUG )
			System.out.println( "Closing with a " + m + "x1 structuring element." );

		StructuringElement se2 = new StructuringElement();
		for( int i = 0; i < m; i++ )
			se2.positive.add( new Pixel( 0, i - m / 2 ) );
		FImage closed = dilated.process( new Close( se2 ) );

		// 2b)iii) E90w = |E90 x (closed-dilated)|z
		// |.|z is the Otsu threshold operator
		FImage e90weak = closed.subtract( dilated ).abs();
		e90weak.multiplyInplace( e.get( 90 ) );
		e90weak = e90weak.process( new OtsuThreshold() );

		if( DEBUG )
			DisplayUtilities.display( e90weak, "Weak Edges" );

		FImage e90edges = e90strong.add( e90weak ).normalise().process(
				new OtsuThreshold() );

		if( DEBUG )
			DisplayUtilities.display( e90edges, "Edges" );

		// 3a) Thin edges
		FImage e90thin = e90edges.process( new Thin( StructuringElement.BOX ) );

		if( DEBUG )
			DisplayUtilities.display( e90thin, "Thinned" );

		ConnectedComponentLabeler ccl = new ConnectedComponentLabeler(
				ConnectMode.CONNECT_4 );
		List<ConnectedComponent> cc = ccl.findComponents( e90thin );

		// 4a) Label edges: every pixel of a component is set to that
		// component's area
		final FImage e90labelled = new FImage( e90thin.getWidth(), e90thin.getHeight() );
		ConnectedComponentProcessor ccp = new ConnectedComponentProcessor()
		{
			@Override
			public void process( ConnectedComponent cc )
			{
				int a = cc.calculateArea();
				for( Pixel p : cc.pixels )
					e90labelled.setPixel( (int)p.getX(), (int)p.getY(), (float)a );
			}
		};
		ConnectedComponent.process( cc, ccp );

		if( DEBUG ) {
			DisplayUtilities.display( e90labelled.clone().normalise(), "Labelled Edges" );
			System.out.println( "Max edge length: " + e90labelled.max() );
		}

		// Threshold the labelled edges to keep only the short ones
		// (those with an area below 3/4 of the maximum)
		FImage e90short = e90labelled.clone().clip( 0f, 1f ).subtract(
				e90labelled.threshold( e90labelled.max() / 4 * 3 ) );

		if( DEBUG )
			DisplayUtilities.display( e90short.clone().normalise(), "Thresholded Lengths" );

		// 5) Feature Map Generation
		// Suppress false regions and enhance candidate regions
		// 5a) candidate = Dilation( e90short )[mxm]
		StructuringElement se3 = new StructuringElement();
		for( int i = 0; i < m; i++ )
			for( int j = 0; j < m; j++ )
				se3.positive.add( new Pixel( i - m / 2, j - m / 2 ) );
		FImage e90candidate = e90short.process( new Dilate( se3 ) );

		if( DEBUG )
			DisplayUtilities.display( e90candidate, "Candidate Regions" );

		// 5b) refined = candidate * sum( e0, e45, e90, e135 );
		FImage is = e.get( 0 ).clone()
				.addInplace( e.get( 45 ) )
				.addInplace( e.get( 90 ) )
				.addInplace( e.get( 135 ) );

		// We normalise the refined image so that we can more simply
		// normalise the pixel values in the feature map generation
		// by the window size, as we know each pixel is in (0,1)
		FImage refined = e90candidate.multiply( is ).normalise();

		if( DEBUG )
			DisplayUtilities.display( refined, "Refined" );

		// 5c) fmap(i,j) = N{W(i,j).sum[-c<m<c]( sum[-c<n<c]( refined(i+m,j+n) ) )}
		int c = 5; // window size -- value not specified in paper
		FImage fmap = new FImage( image.getWidth(), image.getHeight() );

		// Unlike the paper, we use the actual values of the edges
		// in each of the direction images to calculate the weight
		// for each pixel in the feature map. So, instead of counting
		// (which would require thresholding the direction images),
		// we simply add the values of the maximum edges in each
		// of the direction images; this means that if all 4 directions
		// are strongly represented in the window the weight will be near
		// 4 (before normalisation), and if only 1 is, the weight will be
		// near 1. We have to store the maximum for each direction image
		// within the window, which is what this hashmap and the
		// updateMaxPixDir() private method are for.
		HashMap<Integer, Float> maxPixDir = new HashMap<Integer, Float>();

		// For every pixel in the feature map
		for( int j = c; j < image.getHeight() - c; j++ )
		{
			for( int i = c; i < image.getWidth() - c; i++ )
			{
				float pixelValue = 0;
				float N = c * c;
				maxPixDir.clear();

				// Look in the window. Note that a fresh loop variable is
				// used here so the structuring-element size m above is
				// not clobbered.
				for( int mm = -c; mm < c; mm++ )
				{
					for( int n = -c; n < c; n++ )
					{
						pixelValue += refined.getPixel( i + mm, j + n );

						updateMaxPixDir( maxPixDir, e, 0, i + mm, j + n );
						updateMaxPixDir( maxPixDir, e, 45, i + mm, j + n );
						updateMaxPixDir( maxPixDir, e, 90, i + mm, j + n );
						updateMaxPixDir( maxPixDir, e, 135, i + mm, j + n );
					}
				}

				float w = maxPixDir.get( 0 ) +
						maxPixDir.get( 45 ) +
						maxPixDir.get( 90 ) +
						maxPixDir.get( 135 );
				w /= 4; // normalise the weight so it's between 0 and 1

				pixelValue *= w;
				pixelValue /= N;

				fmap.setPixel( i, j, pixelValue );
			}
		}

		if( DEBUG )
			DisplayUtilities.display( fmap.clone().normalise(), "Feature Map" );

		return fmap;
	}

	/**
	 * Helper method to store the maximum pixel value for a given direction
	 * at a given x and y coordinate.
	 *
	 * @param maxPixDir The hashmap in which the maxes are stored
	 * @param e The hashmap of direction images
	 * @param dir The direction to dereference
	 * @param x the x-coord
	 * @param y the y-coord
	 */
	private void updateMaxPixDir( HashMap<Integer, Float> maxPixDir, HashMap<Integer, FImage> e, int dir, int x, int y )
	{
		final float v = e.get( dir ).getPixel( x, y );
		final Float current = maxPixDir.get( dir );
		if( current == null )
			maxPixDir.put( dir, v );
		else
			maxPixDir.put( dir, Math.max( current, v ) );
	}
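
	// A sketch of running just the detection stage above to inspect the
	// feature map (again, the file name is hypothetical and ImageUtilities/
	// java.io.File imports are assumed):
	//
	//   FImage fmap = new LiuSamarabanduTextExtractorBasic()
	//       .textRegionDetection( ImageUtilities.readF( new File( "sign.jpg" ) ) );
	//   DisplayUtilities.display( fmap.clone().normalise(), "Feature Map" );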

	/**
	 * Extract the regions that probably contain text (as given by the
	 * feature map).
	 *
	 * @param fmap The feature map calculated from {@link #textRegionDetection(FImage)}
	 * @param image The original image
	 * @return A map of bounding box to subimage for the localised text regions
	 */
	public Map<Rectangle, FImage> textRegionLocalisation( FImage fmap, FImage image )
	{
		// We'll store the localised text regions in this map
		HashMap<Rectangle, FImage> textAreas = new HashMap<Rectangle, FImage>();

		// Threshold the image to find high probability regions
		FImage thresh = fmap.clone().normalise().process( new OtsuThreshold() );

		// Dilate with a 9x9 structuring element
		StructuringElement se = new StructuringElement();
		int ses = 9;
		for( int i = 0; i < ses; i++ )
			for( int j = 0; j < ses; j++ )
				se.positive.add( new Pixel( i, j ) );

		FImage dilated = thresh.process( new Dilate( se ) );

		if( DEBUG )
			DisplayUtilities.display( dilated, "Candidate text-blobs" );

		// We use the connected-component labeller to extract the components.
		ConnectedComponentLabeler ccl = new
				ConnectedComponentLabeler( ConnectMode.CONNECT_4 );
		List<ConnectedComponent> ccs = ccl.findComponents( dilated );

		if( DEBUG )
			System.out.println( "Got " + ccs.size() + " connected components." );

		// Now we can filter by iterating over the components
		// Heuristics:
		// Area(region) >= (1/20).max
		int maxArea = 0;
		for( PixelSet cc : ccs )
			maxArea = Math.max( maxArea, cc.calculateArea() );

		// - Remove regions that are too small
		for( Iterator<ConnectedComponent> cci = ccs.iterator(); cci.hasNext(); )
			if( cci.next().calculateArea() < maxArea / 20d )
				cci.remove();

		// Ratio(w/h) = w/h >= 0.2
		// - Remove regions that aren't square enough.
		for( Iterator<ConnectedComponent> cci = ccs.iterator(); cci.hasNext(); )
		{
			PixelSet cc = cci.next();
			Rectangle r = cc.calculateRegularBoundingBox();
			if( r.width / r.height < 0.2 )
				cci.remove();
		}

		if( DEBUG ) {
			MBFImage bb = new MBFImage( image.getWidth(), image.getHeight(), 3 );
			bb.createRenderer().drawImage( image, 0, 0 );
			OrientatedBoundingBoxRenderer<Float[]> obbr = new
					OrientatedBoundingBoxRenderer<Float[]>( bb, new Float[] { 1f, 1f, 0f } );
			ConnectedComponent.process( ccs, obbr );
			DisplayUtilities.display( bb );
			System.out.println( "Continuing with " + ccs.size() + " connected components." );
		}

		// Extract the text regions from the original image
		for( PixelSet cc : ccs )
		{
			if( cc.getPixels().size() < 20 ) continue;

			// Extract the text region from the image
			Rectangle r = cc.calculateRegularBoundingBox();
			r.scaleCentroid( boundingBoxPaddingPc );
			FImage textArea = image.extractROI( r );

			// Threshold the image to make it easier to extract MSERs
			OtsuThreshold o = new OtsuThreshold();
			o.processImage( textArea );

			if( DEBUG )
				DisplayUtilities.display( textArea, "text area - before distortion" );

//			// We distort the image to attempt to make the lettering straighter
//			// and more readable for the OCR
//			// We work out the bounding box of the text to distort
//			Polygon p = cc.calculateOrientatedBoundingBox();
//
//			// and the mapping to distort the text to a regular rectangle
//			List<IndependentPair<Point2d,Point2d>> pointPairs = calculateHomography( p );
//
//			// Calculate the homography matrix to distort the image.
//			Matrix homoMatrix = TransformUtilities.homographyMatrix( pointPairs );
//
//			// Transform the image
//			textArea = textArea.transform( homoMatrix );

			SkewCorrector sc = new SkewCorrector();
			sc.setAccuracy( 4 );
			textArea = textArea.process( sc );

			// Store the detected text region and the deskewed image
			textAreas.put( r, textArea );

			if( DEBUG )
				DisplayUtilities.display( textArea, "text area - after distortion" );
		}

		return textAreas;
	}
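
	// A sketch of how the point pairs from calculateHomography() below could
	// drive the un-distortion step, mirroring the commented-out block above
	// (TransformUtilities and Jama's Matrix would need importing):
	//
	//   Polygon obb = cc.calculateOrientatedBoundingBox();
	//   List<IndependentPair<Point2d, Point2d>> pairs = calculateHomography( obb );
	//   Matrix homography = TransformUtilities.homographyMatrix( pairs );
	//   textArea = textArea.transform( homography );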

	/**
	 * Calculates the point pairs that map the vertices of a given distorted
	 * polygon into orthogonal space.
	 *
	 * @param p The polygon with 4 points
	 * @return A list of point pairs
	 */
	public List<IndependentPair<Point2d, Point2d>> calculateHomography( Polygon p )
	{
		// Our output list
		List<IndependentPair<Point2d, Point2d>> pointPairs = new
				ArrayList<IndependentPair<Point2d, Point2d>>();

		// Get the vertices
		//
		// p1----------p4
		// |            |
		// p2----------p3
		//
		List<Point2d> v = p.getVertices();
		Point2d p1 = v.get( 0 );
		Point2d p2 = v.get( 1 );
		Point2d p3 = v.get( 2 );
		Point2d p4 = v.get( 3 );

		// Mapped vertices
		Point2d p1p = new Point2dImpl( p2.getX(), p1.getY() ); // vertically above p2
		Point2d p2p = v.get( 1 ); // don't move
		Point2d p3p = new Point2dImpl( p3.getX(), p2.getY() ); // horizontally level with p2
		Point2d p4p = new Point2dImpl( p3p.getX(), p1.getY() ); // vertically above p3p

		pointPairs.add( new IndependentPair<Point2d, Point2d>( p1, p1p ) );
		pointPairs.add( new IndependentPair<Point2d, Point2d>( p2, p2p ) );
		pointPairs.add( new IndependentPair<Point2d, Point2d>( p3, p3p ) );
		pointPairs.add( new IndependentPair<Point2d, Point2d>( p4, p4p ) );

		return pointPairs;
	}

	/**
	 * Get the expansion multiplier applied to the bounding boxes that are
	 * generated for the text regions.
	 *
	 * @return the current bounding box expansion multiplier.
	 */
	public float getBoundingBoxPaddingPc()
	{
		return boundingBoxPaddingPc;
	}

	/**
	 * Set the expansion multiplier for the subimage extraction.
	 *
	 * @param boundingBoxPaddingPc the new multiplier
	 */
	public void setBoundingBoxPaddingPc( float boundingBoxPaddingPc )
	{
		this.boundingBoxPaddingPc = boundingBoxPaddingPc;
	}

	/**
	 * Returns a map of bounding box to the extracted text-region subimage.
	 *
	 * @return A map of image bounding box to subimage.
	 */
	@Override
	public Map<Rectangle, FImage> getTextRegions()
	{
		return this.textRegions;
	}
}