/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/**
 *
 */
package org.openimaj.image.text.extraction;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.openimaj.citation.annotation.Reference;
import org.openimaj.citation.annotation.ReferenceType;
import org.openimaj.image.DisplayUtilities;
import org.openimaj.image.FImage;
import org.openimaj.image.MBFImage;
import org.openimaj.image.connectedcomponent.ConnectedComponentLabeler;
import org.openimaj.image.pixel.ConnectedComponent;
import org.openimaj.image.pixel.ConnectedComponent.ConnectMode;
import org.openimaj.image.pixel.Pixel;
import org.openimaj.image.pixel.PixelSet;
import org.openimaj.image.processing.convolution.CompassOperators.Compass0;
import org.openimaj.image.processing.convolution.CompassOperators.Compass135;
import org.openimaj.image.processing.convolution.CompassOperators.Compass45;
import org.openimaj.image.processing.convolution.CompassOperators.Compass90;
import org.openimaj.image.processing.convolution.FConvolution;
import org.openimaj.image.processing.morphology.Close;
import org.openimaj.image.processing.morphology.Dilate;
import org.openimaj.image.processing.morphology.StructuringElement;
import org.openimaj.image.processing.morphology.Thin;
import org.openimaj.image.processing.threshold.OtsuThreshold;
import org.openimaj.image.processing.transform.SkewCorrector;
import org.openimaj.image.processor.connectedcomponent.ConnectedComponentProcessor;
import org.openimaj.image.processor.connectedcomponent.render.OrientatedBoundingBoxRenderer;
import org.openimaj.image.text.ocr.OCRProcessor;
import org.openimaj.math.geometry.point.Point2d;
import org.openimaj.math.geometry.point.Point2dImpl;
import org.openimaj.math.geometry.shape.Polygon;
import org.openimaj.math.geometry.shape.Rectangle;
import org.openimaj.util.pair.IndependentPair;

/**
 * A processor that attempts to extract text from an image. It uses a 3-stage
 * process: 1) find possible text regions; 2) filter and then extract those
 * regions; 3) OCR.
 * <p>
 * In the first stage it builds a feature map, which is an image where the
 * pixel intensity is the likelihood of a pixel being within a text region.
 * It does this by a series of convolutions and morphological operations that
 * find regions that have short edges in multiple directions.
 * <p>
 * In the second stage, the regions are turned into blobs and those blobs
 * that are too small or inappropriately shaped are removed. The remaining
 * regions are then extracted from the original image as subimages containing
 * text. The extracted subimages can have an expansion multiplier applied to
 * the box to ensure that enough surrounding information is contained within
 * the extracted subimage for the OCR to work. Use
 * {@link #setBoundingBoxPaddingPc(float)} with a multiplier to expand the
 * bounding boxes; e.g. 1.05 will expand the bounding box by 5%.
 * <p>
 * The third stage simply uses an {@link OCRProcessor} to process the subimages
 * and extract textual strings. Use {@link #setOCRProcessor(OCRProcessor)} to
 * set the {@link OCRProcessor} to use to extract text. Note that by default no
 * processor is set. If the processor is executed without an {@link OCRProcessor}
 * being set, the OCR stage will not occur. This part of the implementation has
 * been moved into the {@link TextExtractor} superclass.
 * <p>
 * The output of the processor can be retrieved using {@link #getTextRegions()},
 * which returns a map where the key is the bounding box of a detected text
 * region and the value is the extracted subimage.
 * <p>
 * From: [paper 01626635.pdf]
 * Xiaoqing Liu and Jagath Samarabandu;
 * An Edge-based Text Region Extraction Algorithm for Indoor Mobile Robot
 * Navigation, Proceedings of the IEEE International Conference on
 * Mechatronics & Automation, Niagara Falls, Canada, July 2005
 *
 * @see "http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1626635"
 * @author David Dupplaw (dpd@ecs.soton.ac.uk)
 * @created 29 Jul 2011
 *
 */
@Reference(
		type = ReferenceType.Inproceedings,
		author = { "Xiaoqing Liu", "Samarabandu, J." },
		title = "An edge-based text region extraction algorithm for indoor mobile robot navigation",
		year = "2005",
		booktitle = "Mechatronics and Automation, 2005 IEEE International Conference",
		pages = { "701", "706 Vol. 2" },
		month = "July-1 Aug.",
		number = "",
		volume = "2",
		customData = { "keywords", "edge-based text region extraction; feature extraction; scene text; text localization; vision-based mobile robot navigation; character recognition; edge detection; feature extraction; mobile robots; navigation; path planning; robot vision;", "doi", "10.1109/ICMA.2005.1626635", "ISSN", "" }
	)
public class LiuSamarabanduTextExtractorBasic extends TextExtractor<FImage>
{
	/** Whether to debug the text extractor - displaying images as it goes */
	public static final boolean DEBUG = false;

	/** Multiplier used to expand the bounding box of a text region; e.g. 1.1 adds 10% */
	private float boundingBoxPaddingPc = 1.1f;

	/** The output of the processor. Use {@link #getTextRegions()} */
	private Map<Rectangle, FImage> textRegions = null;

	/**
	 * Helper method that convolves the given image with the given
	 * convolution operator. The method also performs an abs() and
	 * a normalise(). We can take the absolute value as we're only
	 * interested in whether edges are, for example, vertical, and
	 * not in which direction they point.
	 *
	 * @param img The image to convolve
	 * @param c The convolution operator
	 * @return A convolved image.
	 */
	private FImage processImage( FImage img, FConvolution c )
	{
		return img.process( c )
				.abs()
				.normalise();
	}

	/**
	 * {@inheritDoc}
	 * @see org.openimaj.image.processor.ImageProcessor#processImage(org.openimaj.image.Image)
	 */
	@Override
	public void processImage( FImage image )
	{
		// Find which regions might be text
		FImage fmap = textRegionDetection( image );

		// Process the feature map
		processFeatureMap( fmap, image );

		// The output of the processor is the feature map
		image.internalAssign( fmap );
	}

	/**
	 * Process a feature map. This method updates the field
	 * <code>textRegions</code> as a side effect. Use {@link #getTextRegions()}
	 * to retrieve the text regions extracted by this method.
	 *
	 * @param fmap The feature map to process
	 * @param image The original image.
	 */
	public void processFeatureMap( FImage fmap, FImage image )
	{
		// Extract the text regions from the image
		Map<Rectangle, FImage> t = textRegionLocalisation( fmap, image );
		this.textRegions = t;
	}

	/**
	 * Calculate the feature map that gives the approximate localisation
	 * of candidate text regions.
	 *
	 * @param image The image to process.
	 * @return The feature map
	 */
	public FImage textRegionDetection( FImage image )
	{
		// 1) Directional Filtering
		HashMap<Integer, FImage> e = new HashMap<Integer, FImage>();
		e.put( 0, processImage( image, new Compass0() ) );
		e.put( 45, processImage( image, new Compass45() ) );
		e.put( 90, processImage( image, new Compass90() ) );
		e.put( 135, processImage( image, new Compass135() ) );

		// 2) Edge Selection
		// 2a) Get strong edges
		FImage e90strong = e.get( 90 ).process( new OtsuThreshold() );

		if( DEBUG )
			DisplayUtilities.display( e90strong, "Strong Edges" );

		// 2b) Get weak edges
		// 2b)i) Dilate:
		// Use a horizontal 1x3 structuring element
		StructuringElement se = new StructuringElement();
		se.positive.add( new Pixel( 0, 0 ) );
		se.positive.add( new Pixel( -1, 0 ) );
		se.positive.add( new Pixel( 1, 0 ) );

		if( DEBUG )
			System.out.println( "Dilating with a 1x3 structuring element" );

		FImage dilated = e90strong.process( new Dilate( se ) );

		// 2b)ii) Close:
		// Use a vertical mx1 structuring element
		int m = (int)(dilated.getHeight() / 25d);

		if( DEBUG )
			System.out.println( "Closing with a " + m + "x1 structuring element." );

		StructuringElement se2 = new StructuringElement();
		for( int i = 0; i < m; i++ )
			se2.positive.add( new Pixel( 0, i - m / 2 ) );
		FImage closed = dilated.process( new Close( se2 ) );

		// 2b)iii) E90w = |E90 x (closed-dilated)|z
		// |.|z is the Otsu threshold operator
		FImage e90weak = closed.subtract( dilated ).abs();
		e90weak.multiplyInplace( e.get( 90 ) );
		e90weak = e90weak.process( new OtsuThreshold() );

		if( DEBUG )
			DisplayUtilities.display( e90weak, "Weak Edges" );

		FImage e90edges = e90strong.add( e90weak ).normalise().process(
				new OtsuThreshold() );

		if( DEBUG )
			DisplayUtilities.display( e90edges, "Edges" );

		// 3a) Thin edges
		FImage e90thin = e90edges.process( new Thin( StructuringElement.BOX ) );

		if( DEBUG )
			DisplayUtilities.display( e90thin, "Thinned" );

		ConnectedComponentLabeler ccl = new ConnectedComponentLabeler(
				ConnectMode.CONNECT_4 );
		List<ConnectedComponent> cc = ccl.findComponents( e90thin );

		// 4a) Label edges: every pixel of a component is set to that
		// component's area
		final FImage e90labelled = new FImage( e90thin.getWidth(), e90thin.getHeight() );
		ConnectedComponentProcessor ccp = new ConnectedComponentProcessor()
		{
			@Override
			public void process( ConnectedComponent cc )
			{
				int a = cc.calculateArea();
				for( Pixel p : cc.pixels )
					e90labelled.setPixel( (int)p.getX(), (int)p.getY(), (float)a );
			}
		};
		ConnectedComponent.process( cc, ccp );

		if( DEBUG ) {
			DisplayUtilities.display( e90labelled.clone().normalise(), "Labelled Edges" );
			System.out.println( "Max edge length: " + e90labelled.max() );
		}

		// Threshold the labelled edges to keep only the short ones
		// (those with an area below 3/4 of the maximum)
		FImage e90short = e90labelled.clone().clip( 0f, 1f ).subtract(
				e90labelled.threshold( e90labelled.max() / 4 * 3 ) );

		if( DEBUG )
			DisplayUtilities.display( e90short.clone().normalise(), "Thresholded Lengths" );

		// 5) Feature Map Generation
		// Suppress false regions and enhance candidate regions
		// 5a) candidate = Dilation( e90short )[mxm]
		StructuringElement se3 = new StructuringElement();
		for( int i = 0; i < m; i++ )
			for( int j = 0; j < m; j++ )
				se3.positive.add( new Pixel( i - m / 2, j - m / 2 ) );
		FImage e90candidate = e90short.process( new Dilate( se3 ) );

		if( DEBUG )
			DisplayUtilities.display( e90candidate, "Candidate Regions" );

		// 5b) refined = candidate * sum( e0, e45, e90, e135 );
		FImage is = e.get( 0 ).clone()
				.addInplace( e.get( 45 ) )
				.addInplace( e.get( 90 ) )
				.addInplace( e.get( 135 ) );

		// We normalise the refined image so that we can more simply
		// normalise the pixel values in the feature map generation
		// by the window size, as we know each pixel is in (0,1)
		FImage refined = e90candidate.multiply( is ).normalise();

		if( DEBUG )
			DisplayUtilities.display( refined, "Refined" );

		// 5c) fmap(i,j) = N{W(i,j).sum[-c<m<c]( sum[-c<n<c]( refined(i+m,j+n) ) )}
		int c = 5; // window size -- value not specified in paper
		FImage fmap = new FImage( image.getWidth(), image.getHeight() );

		// Unlike the paper, we use the actual values of the edges
		// in each of the direction images to calculate the weight
		// for each pixel in the feature map. So, instead of counting
		// (which would require thresholding the direction images),
		// we simply add the values of the maximum edges in each
		// of the direction images; this means that if all 4 directions
		// are strongly represented in the window the weight will be near
		// 4 (before normalisation), and if only 1 is, the weight will be
		// near 1. We have to store the maximum for each direction image
		// within the window, which is what this hashmap and the
		// updateMaxPixDir() private method are for.
		HashMap<Integer, Float> maxPixDir = new HashMap<Integer, Float>();

		// For every pixel in the feature map
		for( int j = c; j < image.getHeight() - c; j++ )
		{
			for( int i = c; i < image.getWidth() - c; i++ )
			{
				float pixelValue = 0;
				float N = c * c;
				maxPixDir.clear();

				// Look in the window. Note that a fresh loop variable is
				// used here so the structuring-element size m above is
				// not clobbered.
				for( int mm = -c; mm < c; mm++ )
				{
					for( int n = -c; n < c; n++ )
					{
						pixelValue += refined.getPixel( i + mm, j + n );

						updateMaxPixDir( maxPixDir, e, 0, i + mm, j + n );
						updateMaxPixDir( maxPixDir, e, 45, i + mm, j + n );
						updateMaxPixDir( maxPixDir, e, 90, i + mm, j + n );
						updateMaxPixDir( maxPixDir, e, 135, i + mm, j + n );
					}
				}

				float w = maxPixDir.get( 0 ) +
						maxPixDir.get( 45 ) +
						maxPixDir.get( 90 ) +
						maxPixDir.get( 135 );
				w /= 4; // normalise the weight so it's between 0 and 1

				pixelValue *= w;
				pixelValue /= N;

				fmap.setPixel( i, j, pixelValue );
			}
		}

		if( DEBUG )
			DisplayUtilities.display( fmap.clone().normalise(), "Feature Map" );

		return fmap;
	}

	/**
	 * Helper method to store the maximum pixel value for a given direction
	 * at a given x and y coordinate.
	 *
	 * @param maxPixDir The hashmap in which the maxes are stored
	 * @param e The hashmap of direction images
	 * @param dir The direction to dereference
	 * @param x the x-coord
	 * @param y the y-coord
	 */
	private void updateMaxPixDir( HashMap<Integer, Float> maxPixDir, HashMap<Integer, FImage> e, int dir, int x, int y )
	{
		final float v = e.get( dir ).getPixel( x, y );
		final Float current = maxPixDir.get( dir );
		if( current == null )
			maxPixDir.put( dir, v );
		else
			maxPixDir.put( dir, Math.max( current, v ) );
	}
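
	// A sketch of running just the detection stage above to inspect the
	// feature map (again, the file name is hypothetical and ImageUtilities/
	// java.io.File imports are assumed):
	//
	//   FImage fmap = new LiuSamarabanduTextExtractorBasic()
	//       .textRegionDetection( ImageUtilities.readF( new File( "sign.jpg" ) ) );
	//   DisplayUtilities.display( fmap.clone().normalise(), "Feature Map" );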

	/**
	 * Extract the regions that probably contain text (as given by the
	 * feature map).
	 *
	 * @param fmap The feature map calculated from {@link #textRegionDetection(FImage)}
	 * @param image The original image
	 * @return A map of bounding box to subimage for the localised text regions
	 */
	public Map<Rectangle, FImage> textRegionLocalisation( FImage fmap, FImage image )
	{
		// We'll store the localised text regions in this map
		HashMap<Rectangle, FImage> textAreas = new HashMap<Rectangle, FImage>();

		// Threshold the image to find high probability regions
		FImage thresh = fmap.clone().normalise().process( new OtsuThreshold() );

		// Dilate with a 9x9 structuring element
		StructuringElement se = new StructuringElement();
		int ses = 9;
		for( int i = 0; i < ses; i++ )
			for( int j = 0; j < ses; j++ )
				se.positive.add( new Pixel( i, j ) );

		FImage dilated = thresh.process( new Dilate( se ) );

		if( DEBUG )
			DisplayUtilities.display( dilated, "Candidate text-blobs" );

		// We use the connected-component labeller to extract the components.
		ConnectedComponentLabeler ccl = new
				ConnectedComponentLabeler( ConnectMode.CONNECT_4 );
		List<ConnectedComponent> ccs = ccl.findComponents( dilated );

		if( DEBUG )
			System.out.println( "Got " + ccs.size() + " connected components." );

		// Now we can filter by iterating over the components
		// Heuristics:
		// Area(region) >= (1/20).max
		int maxArea = 0;
		for( PixelSet cc : ccs )
			maxArea = Math.max( maxArea, cc.calculateArea() );

		// - Remove regions that are too small
		for( Iterator<ConnectedComponent> cci = ccs.iterator(); cci.hasNext(); )
			if( cci.next().calculateArea() < maxArea / 20d )
				cci.remove();

		// Ratio(w/h) = w/h >= 0.2
		// - Remove regions that aren't square enough.
		for( Iterator<ConnectedComponent> cci = ccs.iterator(); cci.hasNext(); )
		{
			PixelSet cc = cci.next();
			Rectangle r = cc.calculateRegularBoundingBox();
			if( r.width / r.height < 0.2 )
				cci.remove();
		}

		if( DEBUG ) {
			MBFImage bb = new MBFImage( image.getWidth(), image.getHeight(), 3 );
			bb.createRenderer().drawImage( image, 0, 0 );
			OrientatedBoundingBoxRenderer<Float[]> obbr = new
					OrientatedBoundingBoxRenderer<Float[]>( bb, new Float[] { 1f, 1f, 0f } );
			ConnectedComponent.process( ccs, obbr );
			DisplayUtilities.display( bb );
			System.out.println( "Continuing with " + ccs.size() + " connected components." );
		}

		// Extract the text regions from the original image
		for( PixelSet cc : ccs )
		{
			if( cc.getPixels().size() < 20 ) continue;

			// Extract the text region from the image
			Rectangle r = cc.calculateRegularBoundingBox();
			r.scaleCentroid( boundingBoxPaddingPc );
			FImage textArea = image.extractROI( r );

			// Threshold the image to make it easier to extract MSERs
			OtsuThreshold o = new OtsuThreshold();
			o.processImage( textArea );

			if( DEBUG )
				DisplayUtilities.display( textArea, "text area - before distortion" );

//			// We distort the image to attempt to make the lettering straighter
//			// and more readable for the OCR
//			// We work out the bounding box of the text to distort
//			Polygon p = cc.calculateOrientatedBoundingBox();
//
//			// and the mapping to distort the text to a regular rectangle
//			List<IndependentPair<Point2d,Point2d>> pointPairs = calculateHomography( p );
//
//			// Calculate the homography matrix to distort the image.
//			Matrix homoMatrix = TransformUtilities.homographyMatrix( pointPairs );
//
//			// Transform the image
//			textArea = textArea.transform( homoMatrix );

			SkewCorrector sc = new SkewCorrector();
			sc.setAccuracy( 4 );
			textArea = textArea.process( sc );

			// Store the detected text region and the deskewed image
			textAreas.put( r, textArea );

			if( DEBUG )
				DisplayUtilities.display( textArea, "text area - after distortion" );
		}

		return textAreas;
	}
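
	// A sketch of how the point pairs from calculateHomography() below could
	// drive the un-distortion step, mirroring the commented-out block above
	// (TransformUtilities and Jama's Matrix would need importing):
	//
	//   Polygon obb = cc.calculateOrientatedBoundingBox();
	//   List<IndependentPair<Point2d, Point2d>> pairs = calculateHomography( obb );
	//   Matrix homography = TransformUtilities.homographyMatrix( pairs );
	//   textArea = textArea.transform( homography );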

	/**
	 * Calculates the point pairs that map the vertices of a given distorted
	 * polygon into orthogonal space.
	 *
	 * @param p The polygon with 4 points
	 * @return A list of point pairs
	 */
	public List<IndependentPair<Point2d, Point2d>> calculateHomography( Polygon p )
	{
		// Our output list
		List<IndependentPair<Point2d, Point2d>> pointPairs = new
				ArrayList<IndependentPair<Point2d, Point2d>>();

		// Get the vertices
		//
		// p1----------p4
		// |            |
		// p2----------p3
		//
		List<Point2d> v = p.getVertices();
		Point2d p1 = v.get( 0 );
		Point2d p2 = v.get( 1 );
		Point2d p3 = v.get( 2 );
		Point2d p4 = v.get( 3 );

		// Mapped vertices
		Point2d p1p = new Point2dImpl( p2.getX(), p1.getY() ); // vertically above p2
		Point2d p2p = v.get( 1 ); // don't move
		Point2d p3p = new Point2dImpl( p3.getX(), p2.getY() ); // horizontally level with p2
		Point2d p4p = new Point2dImpl( p3p.getX(), p1.getY() ); // vertically above p3p

		pointPairs.add( new IndependentPair<Point2d, Point2d>( p1, p1p ) );
		pointPairs.add( new IndependentPair<Point2d, Point2d>( p2, p2p ) );
		pointPairs.add( new IndependentPair<Point2d, Point2d>( p3, p3p ) );
		pointPairs.add( new IndependentPair<Point2d, Point2d>( p4, p4p ) );

		return pointPairs;
	}

	/**
	 * Get the expansion multiplier applied to the bounding boxes that are
	 * generated for the text regions.
	 *
	 * @return the current bounding box expansion multiplier.
	 */
	public float getBoundingBoxPaddingPc()
	{
		return boundingBoxPaddingPc;
	}

	/**
	 * Set the expansion multiplier for the subimage extraction.
	 *
	 * @param boundingBoxPaddingPc the new multiplier
	 */
	public void setBoundingBoxPaddingPc( float boundingBoxPaddingPc )
	{
		this.boundingBoxPaddingPc = boundingBoxPaddingPc;
	}

	/**
	 * Returns a map of bounding box to the extracted text-region subimage.
	 *
	 * @return A map of image bounding box to subimage.
	 */
	@Override
	public Map<Rectangle, FImage> getTextRegions()
	{
		return this.textRegions;
	}
}