001/**
002 * Copyright 2010 The University of Southampton, Yahoo Inc., and the
003 * individual contributors. All rights reserved.
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *    http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.openimaj.web.readability;
018
019import java.io.IOException;
020import java.io.StringReader;
021import java.net.URL;
022import java.text.ParseException;
023import java.text.SimpleDateFormat;
024import java.util.ArrayList;
025import java.util.Date;
026import java.util.EnumSet;
027import java.util.List;
028import java.util.regex.Matcher;
029import java.util.regex.Pattern;
030
031import org.cyberneko.html.parsers.DOMFragmentParser;
032import org.cyberneko.html.parsers.DOMParser;
033import org.pojava.datetime.DateTime;
034import org.w3c.dom.DOMException;
035import org.w3c.dom.Document;
036import org.w3c.dom.DocumentFragment;
037import org.w3c.dom.Element;
038import org.w3c.dom.Node;
039import org.w3c.dom.NodeList;
040import org.w3c.dom.bootstrap.DOMImplementationRegistry;
041import org.w3c.dom.ls.DOMImplementationLS;
042import org.w3c.dom.ls.LSSerializer;
043import org.w3c.dom.traversal.DocumentTraversal;
044import org.w3c.dom.traversal.NodeFilter;
045import org.w3c.dom.traversal.TreeWalker;
046import org.xml.sax.InputSource;
047import org.xml.sax.SAXException;
048
049/**
 * Class for extracting the "content" from web pages, ignoring adverts, navigation, etc.
 * Based upon readability.js (http://lab.arc90.com/experiments/readability/) and
 * modified to behave better for certain sites (and to more closely mimic Safari
 * Reader functionality).
054 *  
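 * <p>
 * A minimal usage sketch (this assumes the page has already been parsed into an
 * org.w3c.dom.Document, for example with the NekoHTML DOMParser imported above;
 * the variable names are illustrative only):
 * <pre>
 * DOMParser parser = new DOMParser();
 * parser.parse(new InputSource(new StringReader(html)));
 * Readability readability = new Readability(parser.getDocument(), false, true);
 * if (readability.hasContent())
 *     System.out.println(readability.getArticleTitle());
 * </pre>
 *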
055 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
056 * @author Michael Matthews (mikemat@yahoo-inc.com)
057 * @author David Dupplaw (dpd@ecs.soton.ac.uk)
058 */
059public class Readability
060{
061        /**
062         * Regular expressions for different types of content
063         */
064        protected static class Regexps {
065
066                public static String unlikelyCandidatesRe = "(?i)combx|comment|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor|story-feature|banner"; //caption?
067                public static String okMaybeItsACandidateRe = "(?i)and|comments|article|body|column|main";
068                public static String positiveRe = "(?i)article|body|comments|content|entry|hentry|page|pagination|post|text";
069                public static String negativeRe= "(?i)combx|comment|contact|foot|footer|footnote|link|masthead|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget|warning";
070                public static String divToPElementsRe = "(?i)(a|blockquote|dl|div|img|ol|p|pre|table|ul)";
071                public static String replaceBrsRe = "(?i)(<br[^>]*>[ \n\r\t]*){2,}";
072                public static String replaceFontsRe ="(?i)<(\\/?)font[^>]*>";
073                public static String trimRe = "^\\s+|\\s+$";
074                public static String normalizeRe = "\\s{2,}";
075                public static String killBreaksRe = "(<br\\s*\\/?>(\\s|&nbsp;?)*){1,}";
076                public static String videoRe = "(?i)http:\\/\\/(www\\.)?(youtube|vimeo)\\.com";
077
078                public static String titleSeparatorRe = "\\|\\-\\/";
079
                //this is used to try to find elements that represent sub-headings (that are not h1..h6)
081                public static String likelySubheadCandidateRe = "(?i)cross-head";
082        }
083
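        /**
         * Processing flags. FLAG_STRIP_UNLIKELYS removes probable non-content
         * nodes up front; FLAG_WEIGHT_CLASSES lets class/id names influence node
         * scores. Each is dropped in turn by init() if the first pass yields too
         * little content.
         */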
084        enum Flag {
085                FLAG_STRIP_UNLIKELYS,
086                FLAG_WEIGHT_CLASSES
087        }
088
089        /**
090         * Threshold for removing elements with lots of links
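         * (a value of 0.33 means elements whose link text makes up more than roughly a third of their text are treated as link-heavy)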
091         */
092        public static float LINK_DENSITY_THRESHOLD = 0.33F; 
093
094
095        //IVARS below
096        protected Document document;
097        private Node bodyCache;
098        protected EnumSet<Flag> flags = EnumSet.allOf(Flag.class);
099
100        protected String articleTitle;
101        protected Element articleContent;
102        protected String article_date_string;
103        protected Date article_date;
104        protected String article_contentType;
105
106        protected boolean debug = false;
107
108        protected boolean addTitle = false;
109
110        /**
111         * Construct with the given document. Debugging is disabled. 
112         * @param document The document.
113         */
114        public Readability(Document document) {
115                this(document, false);
116        }
117
118        /**
119         * Construct with the given document. The second argument can be used to enable
120         * debugging output. 
121         * @param document The document.
122         * @param debug Enable debugging output.
123         */
124        public Readability(Document document, boolean debug) {
125                this(document, debug, false);
126        }
127
128        /**
129         * Construct with the given document. The second argument can be used to enable
130         * debugging output. The third option controls whether the title should be 
131         * included in the output.
132         * @param document The document.
133         * @param debug Enable debugging output.
134         * @param addTitle Add title to output.
135         */
136        public Readability(Document document, boolean debug, boolean addTitle) {
137                this.debug = debug;
138                this.document = document;
139                this.addTitle = addTitle;
140                augmentDocument(document);
141                init();
142        }
143
144        /**
145         * Iterates through all the ELEMENT nodes in a document
146         * and gives them ids if they don't already have them.
147         * 
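         * Generated ids take the form {@code gen-id-N}, where N is a running counter.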
148         * @param document
149         */
150        public static void augmentDocument(Document document) {
151                DocumentTraversal traversal = (DocumentTraversal) document;
152
153                TreeWalker walker = traversal.createTreeWalker(document, NodeFilter.SHOW_ELEMENT, null, true);
154
155                traverseLevel(walker, 0);
156        }
157
158        private static int traverseLevel(TreeWalker walker, int counter) {
159                // describe current node:
160                Node parend = walker.getCurrentNode();
161                
162                if (parend instanceof Element) {
163                        if (((Element)parend).getAttribute("id").length() == 0) {
164                                ((Element)parend).setAttribute("id", "gen-id-"+counter);
165                                counter++;
166                        }
167                }
168                
169                // traverse children:
                for (Node n = walker.firstChild(); n != null; n = walker.nextSibling()) {
                        counter = traverseLevel(walker, counter);
                }
174
175                // return position to the current (level up):
176                walker.setCurrentNode(parend);
177                
178                return counter;
179        }
180
181        protected void dbg(String s) {
182                if (debug)
183                        System.err.println(s);
184        }
185
186        protected String getTitle() {
187                NodeList l = document.getElementsByTagName("title");
188
189                if (l.getLength() == 0) return "";
190
191                return l.item(0).getTextContent();
192        }
193
        /**
         * JavaScript-like String.match
         * @param input the string to search
         * @param regex the pattern to match
         * @return all matches of the pattern within the input
         */
200        protected String[] match(String input, String regex) {
201                Matcher matcher = Pattern.compile(regex).matcher(input);
202                List<String> matches = new ArrayList<String>();
203
204                while ( matcher.find() ) {
205                        matches.add(matcher.group(0));
206                }
207
208                return matches.toArray(new String[matches.size()]);
209        }
210
211        /**
212         * @return True if the article has any detected content; false otherwise.
213         */
214        public boolean hasContent() {
215                return articleContent != null;
216        }
217        
        /**
         * JavaScript-like String.search
         * @param input the string to search
         * @param regex the pattern to find
         * @return the index of the first match, or -1 if there is no match
         */
224        protected int search(String input, String regex) {
225                Matcher matcher = Pattern.compile(regex).matcher(input);
226
227                if (!matcher.find()) return -1;
228                return matcher.start();
229        }
230
231
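        /**
         * Record the article content type from a
         * {@code meta http-equiv="Content-Type"} tag, if one is present.
         */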
232        protected void findArticleEncoding() {
233                NodeList nl = document.getElementsByTagName("meta");
234                for (int j=0; j<nl.getLength(); j++) {
235                        if (((Element)nl.item(j)).getAttribute("http-equiv").equals("Content-Type")) {
236                                article_contentType = ((Element)nl.item(j)).getAttribute("content");
237                                return;
238                        }
239                }
240
241        }
242
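        /**
         * Attempt to find the article date. Looks, in order, at meta tags
         * (OriginalPublicationDate, DC.date.issued), a {@code time} element with a
         * pubdate attribute, elements whose class or id contains "date", and finally
         * a "Last updated at ..." pattern in the page text.
         */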
243        protected void findArticleDate() {
244                //<meta name="OriginalPublicationDate" content="2010/07/12 14:08:02"/>
245                //<meta name="DC.date.issued" content="2010-07-12">
246                NodeList nl = document.getElementsByTagName("meta");
247                for (int j=0; j<nl.getLength(); j++) {
248                        if (((Element)nl.item(j)).getAttribute("name").equals("OriginalPublicationDate")) {
249                                article_date_string = ((Element)nl.item(j)).getAttribute("content");
250                                article_date = DateTime.parse(article_date_string).toDate();
251                                return;
252                        }
253                        if (((Element)nl.item(j)).getAttribute("name").equals("DC.date.issued")) {
254                                article_date_string = ((Element)nl.item(j)).getAttribute("content");
255                                article_date = DateTime.parse(article_date_string).toDate();
256                                return;
257                        }
258                }
259
260                //<time datetime="2010-07-12T10:26BST" pubdate>Monday 12 July 2010 10.26 BST</time>
261                nl = document.getElementsByTagName("time");
262                for (int j=0; j<nl.getLength(); j++) {
263                        if (((Element)nl.item(j)).getAttributeNode("pubdate") != null) {
264                                article_date_string = ((Element)nl.item(j)).getAttribute("datetime");
265                                article_date = DateTime.parse(article_date_string).toDate();
266                                return;
267                        }
268                }
269
270                //<span class="date">14:08 GMT, Monday, 12 July 2010 15:08 UK</span>
271                //<p class="date">09.07.2010 @ 17:49 CET</p>
272                //<p class="date">Today @ 09:29 CET</p>
273                nl = document.getElementsByTagName("*");
274                for (int j=0; j<nl.getLength(); j++) {
275                        if ((((Element)nl.item(j)).getAttribute("class").contains("date") ||
276                                        ((Element)nl.item(j)).getAttribute("class").contains("Date") ) && 
277                                        !(((Element)nl.item(j)).getAttribute("class").contains("update") ||
278                                                        ((Element)nl.item(j)).getAttribute("class").contains("Update"))
279                        ) {
280                                article_date_string = getInnerTextSep((Element)nl.item(j)).trim();
281                                parseDate();
282                                return;
283                        }
284                }
285                for (int j=0; j<nl.getLength(); j++) {
286                        if ((((Element)nl.item(j)).getAttribute("id").contains("date") ||
287                                        ((Element)nl.item(j)).getAttribute("id").contains("Date") ) && 
288                                        !(((Element)nl.item(j)).getAttribute("id").contains("update") ||
289                                                        ((Element)nl.item(j)).getAttribute("id").contains("Update"))
290                        ) {
291                                article_date_string = getInnerTextSep((Element)nl.item(j)).trim();
292                                parseDate();
293                                return;
294                        }
295                }
296
297                //Last updated at 3:05 PM on 12th July 2010
298                nl = document.getElementsByTagName("*");
299                for (int j=0; j<nl.getLength(); j++) {
300                        String text = nl.item(j).getTextContent();
301
302                        if (text == null)
303                                continue;
304
305                        Pattern p = Pattern.compile("Last updated at (\\d+:\\d\\d [AP]M on \\d+[thsndr]+ \\w+ \\d\\d\\d\\d)");
306                        Matcher m = p.matcher(text);
307                        if (m.find()) {
308                                article_date_string = m.group(1);
309
                                //strip the ordinal suffix from the day without mangling month names (e.g. "August")
                                String cpy = article_date_string.replaceAll("(?<=\\d)(st|nd|rd|th)", "");
314
315                                SimpleDateFormat sdf = new SimpleDateFormat("h:mm a 'on' dd MMMM yyyy");
316                                try { article_date = sdf.parse(cpy); } catch (ParseException e) {}
317                                return;
318                        }
319                }
320        }
321
322        @SuppressWarnings("deprecation")
323        protected void parseDate() {
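                //Formats are tried in turn: "Today @ HH:mm z", then "h:mm z, E, dd M yyyy",
                //then "dd.MM.yyyy @ HH:mm z", then "dd/MM/yyyy", and finally the PoJava
                //DateTime heuristic parser.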
324                if (article_date_string == null || article_date_string.trim().isEmpty() ) return;
325
326                if (article_date_string.contains("Today")) {
327                        try {
328                                SimpleDateFormat sdf = new SimpleDateFormat("'Today @' HH:mm z");
329                                article_date = sdf.parse(article_date_string);
330                                Date now = new Date();
331                                article_date.setDate(now.getDate());
332                                article_date.setMonth(now.getMonth());
333                                article_date.setYear(now.getYear());
334                        } catch (ParseException e) {}
335                } else {
336                        try { 
337                                SimpleDateFormat sdf = new SimpleDateFormat("h:mm z',' E',' dd M yyyy");
338                                article_date = sdf.parse(article_date_string); 
339                        } catch (ParseException e) {
340                                try {
341                                        SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy '@' HH:mm z");
342                                        article_date = sdf.parse(article_date_string);
343                                } catch (ParseException ee) {
344                                        try {
345                                                SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy");
346                                                article_date = sdf.parse(article_date_string);
347                                        } catch (ParseException eee) {
348                                                try {
349                                                        article_date = DateTime.parse(article_date_string).toDate();
350                                                } catch (IllegalArgumentException ie) {
351                                                } catch (java.lang.ArrayIndexOutOfBoundsException ie) {
352                                                        System.out.println(article_date_string);
353                                                }                                                       
354                                        }       
355                                }
356                        }
357                }
358        }
359
        /**
         * Get the article title.
         *
         * @return the article title
         **/
365        protected String findArticleTitle() {
366                String curTitle = "", origTitle = "";
367
368                curTitle = origTitle = getTitle();
369
370                //
371                List<String> potentialTitles = new ArrayList<String>();
372                for (int i=1; i<=6; i++) {
373                        NodeList nl = document.getElementsByTagName("h"+i);
374                        if (nl.getLength() > 0) {
375                                for (int j=0; j<nl.getLength(); j++)
376                                        potentialTitles.add(nl.item(j).getTextContent().trim());
377                        }
378                }
379
380                String potentialTitle = null;
381                int score = 0;
382                for (String s : potentialTitles) {
383                        if (s.length()>score && curTitle.contains(s)) {
384                                potentialTitle = s;
385                                score = s.length();
386                        }
387                }
388                if (potentialTitle != null) return potentialTitle;
389                //
390
391                if(match(curTitle, " ["+Regexps.titleSeparatorRe+"]+ ").length > 0)
392                {
393                        curTitle = origTitle.replaceAll("(.*) ["+Regexps.titleSeparatorRe+"]+ .*", "$1");
394
395                        if(curTitle.split(" ").length < 3) {
396                                curTitle = origTitle.replaceAll("(?i)[^"+Regexps.titleSeparatorRe+"]*["+Regexps.titleSeparatorRe+"]+(.*)", "$1");
397                        }
398                }
399                else if(curTitle.indexOf(": ") != -1)
400                {
401                        curTitle = origTitle.replaceAll("(?i).*:(.*)", "$1");
402
403                        if(curTitle.split(" ").length < 3) {
404                                curTitle = origTitle.replaceAll("(?i)[^:]*[:](.*)", "$1");
405                        }
406                }
407                else if(curTitle.length() > 150 || curTitle.length() < 15)
408                {
409                        NodeList hOnes = document.getElementsByTagName("h1");
410                        if(hOnes.getLength() == 1)
411                        {
412                                curTitle = getInnerText((Element) hOnes.item(0));
413                        }
414                }
415
416                curTitle = curTitle.replaceAll( Regexps.trimRe, "" );
417
418                if(curTitle.split(" ").length <= 3) {
419                        curTitle = origTitle;
420                }
421
422                return curTitle;
423        }       
424
425        /**
426         * Equivalent to document.body in JS
427         * @return
428         */
429        protected Element getBody() {
430                NodeList nl = document.getElementsByTagName("body");
431
432                if (nl.getLength() == 0) 
433                        return null;
434                else 
435                        return (Element) nl.item(0);
436        }
437
438        /**
439         * Runs readability.
440         * 
441         * Workflow:
442         *  1. Prep the document by removing script tags, css, etc.
         *  2. Build readability's DOM tree.
444         *  3. Grab the article content from the current dom tree.
445         *  4. Replace the current DOM tree with the new one.
446         *  5. Read peacefully.
447         *
448         **/
449        protected void init() {
                if(getBody() != null && bodyCache == null) {
                        bodyCache = getBody().cloneNode(true);
                }
452
453                findArticleDate(); //must be done before prepDocument() 
454
455                findArticleEncoding();
456
457                prepDocument();
458
                /* Build readability's DOM tree */
460                articleTitle = findArticleTitle();
461                articleContent = grabArticle();
462
463                /**
464                 * If we attempted to strip unlikely candidates on the first run through, and we ended up with no content,
                 * that may mean we stripped out the actual content so we couldn't parse it. So re-run init while preserving
466                 * unlikely candidates to have a better shot at getting our content out properly.
467                 **/
468                if(getInnerText(articleContent, false).length() < 250)
469                {
470                        if (flags.contains(Flag.FLAG_STRIP_UNLIKELYS)) {
471                                flags.remove(Flag.FLAG_STRIP_UNLIKELYS);
472                                getBody().getParentNode().replaceChild(bodyCache, getBody());
473                                init();
474                                return;
475                        }
476                        else if (flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
477                                flags.remove(Flag.FLAG_WEIGHT_CLASSES);
478                                getBody().getParentNode().replaceChild(bodyCache, getBody());
479                                init();
480                                return; 
481                        }
482                        else {
483                                articleContent = null;
484                        }
485                }
486
487                if (addTitle && articleContent != null) {
488                        Element titleNode = document.createElement("h1");
489                        titleNode.setAttribute("id", "title");
490                        titleNode.appendChild(document.createTextNode(getArticleTitle()));
491                        articleContent.insertBefore(titleNode, articleContent.getFirstChild());
492                }
493        }
494
495        /**
496         * Prepare the HTML document for readability to scrape it.
497         * This includes things like stripping javascript, CSS, and handling terrible markup.
498         * 
499         **/
500        protected void prepDocument() {
501                /**
                 * In some cases a body element can't be found (if the HTML is totally hosed for example)
503                 * so we create a new body node and append it to the document.
504                 */
                if(getBody() == null)
                {
                        Node body = document.createElement("body");
                        //append under the document element if one exists; a Document may only have a single element child
                        if (document.getDocumentElement() != null)
                                document.getDocumentElement().appendChild(body);
                        else
                                document.appendChild(body);
                }
510
511                //frames are not supported in this version!
512                //        NodeList frames = document.getElementsByTagName("frame");
513                //        if(frames.length > 0)
514                //        {
515                //            Node bestFrame = null;
516                //            int bestFrameSize = 0;
517                //            for(int frameIndex = 0; frameIndex < frames.getLength(); frameIndex++)
518                //            {
519                //                int frameSize = frames.item(frameIndex).offsetWidth + frames[frameIndex].offsetHeight;
520                //                var canAccessFrame = false;
521                //                try {
522                //                    frames[frameIndex].contentWindow.document.body;
523                //                    canAccessFrame = true;
524                //                }
525                //                catch(eFrames) {
526                //                    dbg(eFrames);
527                //                }
528                //                
529                //                if(canAccessFrame && frameSize > bestFrameSize)
530                //                {
531                //                    bestFrame = frames[frameIndex];
532                //                    bestFrameSize = frameSize;
533                //                }
534                //            }
535                //
536                //            if(bestFrame)
537                //            {
538                //                var newBody = document.createElement("body");
539                //                newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML;
540                //                newBody.style.overflow = "scroll";
541                //                document.body = newBody;
542                //                
543                //                var frameset = document.getElementsByTagName("frameset")[0];
544                //                if(frameset) {
545                //                    frameset.parentNode.removeChild(frameset); }
546                //                    
547                //                readability.frameHack = true;
548                //            }
549                //        }
550
551                /* remove all scripts that are not readability */
552                NodeList scripts = document.getElementsByTagName("script");
553                for(int i = scripts.getLength()-1; i >= 0; i--)
554                {
555                        scripts.item(i).getParentNode().removeChild(scripts.item(i));          
556                }
557
558                /* Remove all style tags in head */
559                NodeList styleTags = document.getElementsByTagName("style");
560                for (int st=0;st < styleTags.getLength(); st++) {
561                        styleTags.item(st).getParentNode().removeChild(styleTags.item(st));
562                }
563
564                /* Remove all meta tags  */
565                NodeList metaTags = document.getElementsByTagName("meta");
566                for (int mt=0;mt < metaTags.getLength(); mt++) {
567                        metaTags.item(mt).getParentNode().removeChild(metaTags.item(mt));
568                }
569
570                /* Turn all double br's into p's */
571                /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
572                //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrsRe, '</p><p>').replace(readability.regexps.replaceFontsRe, '<$1span>');
573                Element body = getBody();
574                //              Node rep = stringToNode(nodeToString(body).replaceAll(Regexps.replaceBrsRe, "</P><P>").replaceAll(Regexps.replaceFontsRe, "<$1span>"));
575                //              body.getParentNode().replaceChild(rep, body);
576
577                //This is slow!
578                Node frag = stringToNode(getInnerHTML(body).replaceAll(Regexps.replaceBrsRe, "</P><P>").replaceAll(Regexps.replaceFontsRe, "<$1span>"));
579                removeChildren(body);
580                body.appendChild(frag);
581
582                /* Remove all comments */
583                removeComments(document);
584        }
585
586        protected void removeComments(Node n) {
587                if (n.getNodeType() == Node.COMMENT_NODE) {
588                        n.getParentNode().removeChild(n);
589                } else {
590                        NodeList nl = n.getChildNodes();
591                        for (int i=0; i<nl.getLength(); i++) 
592                                removeComments(nl.item(i));
593                }
594        }
595
        /**
         * Prepare the article node for display. Clean out any inline styles,
         * iframes, forms, strip extraneous &lt;p&gt; tags, etc.
         *
         * @param articleContent the article content element
         **/
602        protected void prepArticle(Element articleContent) {
603                cleanStyles(articleContent);
604                killBreaks(articleContent);
605
606                /* Clean out junk from the article content */
607                clean(articleContent, "form");
608                clean(articleContent, "object");
609                clean(articleContent, "h1");
610                /**
611                 * If there is only one h2, they are probably using it
612                 * as a header and not a subheader, so remove it since we already have a header.
613                 ***/
614                if(articleContent.getElementsByTagName("h2").getLength() == 1) {
615                        clean(articleContent, "h2"); 
616                }
617                clean(articleContent, "iframe");
618
619                cleanHeaders(articleContent);
620
621                /* Do these last as the previous stuff may have removed junk that will affect these */
622                cleanConditionally(articleContent, "table");
623                cleanConditionally(articleContent, "ul");
624                cleanConditionally(articleContent, "div");
625
626                /* Remove extra paragraphs */
627                NodeList articleParagraphs = articleContent.getElementsByTagName("p");
628                for(int i = articleParagraphs.getLength()-1; i >= 0; i--)
629                {
630                        int imgCount    = ((Element) articleParagraphs.item(i)).getElementsByTagName("img").getLength();
631                        int embedCount  = ((Element) articleParagraphs.item(i)).getElementsByTagName("embed").getLength();
632                        int objectCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("object").getLength();
633
                        if(imgCount == 0 && embedCount == 0 && objectCount == 0 && getInnerText((Element) articleParagraphs.item(i), false).length() == 0)
635                        {
636                                articleParagraphs.item(i).getParentNode().removeChild(articleParagraphs.item(i));
637                        }
638                }
639
640                //articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, "<p");
641                Node n = stringToNode(getInnerHTML(articleContent).replaceAll("(?i)<br[^>]*>\\s*<p", "<P"));
642                removeChildren(articleContent);
643                articleContent.appendChild(n);
644
645                //now remove empty p's and tidy up
646                NodeList nl = articleContent.getElementsByTagName("p");
647                for (int i=nl.getLength()-1; i>=0; i--) {
648                        if (nl.item(i).getTextContent().trim().length() == 0) 
649                        {
650                                nl.item(i).getParentNode().removeChild(nl.item(i));
651                        } else if (nl.item(i).getChildNodes().getLength() == 1 && nl.item(i).getChildNodes().item(0).getNodeType() == Node.TEXT_NODE) {
652                                nl.item(i).setTextContent("\n" + nl.item(i).getTextContent().trim() + "\n");
653                        }
654                        else if (((Element) nl.item(i)).getAttribute("class").equals("readability-styled")) 
655                        {
656                                nl.item(i).getParentNode().replaceChild(document.createTextNode(nl.item(i).getTextContent()), nl.item(i));
657                        }
658                }
659
660        }
661
662        protected void removeChildren(Node n) {
663                NodeList nl = n.getChildNodes();
664                int nn = nl.getLength();
665                for (int i=0; i<nn; i++)
666                        n.removeChild(nl.item(0));
667        }
668
        /**
         * Initialize a node with its readability score. Also checks the
         * className/id for special names to add to its score.
         *
         * @param node the element to initialize
         **/
675        protected void initializeNode(Element node) {
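                //Base scores by tag: DIV +5; PRE/TD/BLOCKQUOTE +3; lists, definition
                //list items, forms and addresses -3; headings and TH -5. The class/id
                //weight from getClassWeight() is then added on top.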
                float contentScore = 0;

                String tagName = node.getTagName();
                if (tagName.equals("DIV")) {
                        contentScore += 5;
                } else if (tagName.equals("PRE") || tagName.equals("TD") || tagName.equals("BLOCKQUOTE")) {
                        contentScore += 3;
                } else if (tagName.equals("ADDRESS") || tagName.equals("OL") || tagName.equals("UL")
                                || tagName.equals("DL") || tagName.equals("DD") || tagName.equals("DT")
                                || tagName.equals("LI") || tagName.equals("FORM")) {
                        contentScore -= 3;
                } else if (tagName.equals("H1") || tagName.equals("H2") || tagName.equals("H3")
                                || tagName.equals("H4") || tagName.equals("H5") || tagName.equals("H6")
                                || tagName.equals("TH")) {
                        contentScore -= 5;
                }
691
692                contentScore += getClassWeight(node);
693                node.setUserData("readability", contentScore, null);
694        }
695
        /**
         * Get an element's class/id weight. Uses regular expressions to tell if this
         * element looks good or bad.
         *
         * @param e the element to weigh
         * @return the weight (integer)
         **/
703        protected int getClassWeight(Element e) {
704                if (!flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
705                        return 0;
706                }
707
708                int weight = 0;
709
710                /* Look for a special classname */
                if (!e.getAttribute("class").equals(""))
712                {
713                        if(search(e.getAttribute("class"), Regexps.negativeRe) != -1) {
714                                weight -= 25;
715                        }
716
717                        if(search(e.getAttribute("class"), Regexps.positiveRe) != -1) {
718                                weight += 25;
719                        }
720                }
721
722                /* Look for a special ID */
                if (!e.getAttribute("id").equals(""))
724                {
725                        if(search(e.getAttribute("id"), Regexps.negativeRe) != -1) {
726                                weight -= 25;
727                        }
728
729                        if(search(e.getAttribute("id"), Regexps.positiveRe) != -1) {
730                                weight += 25;
731                        }
732                }
733
734                return weight;
735        }
736
        protected void cleanStyles() {
                //a Document is not an Element, so start from the document element
                cleanStyles(document.getDocumentElement());
        }
740
        /**
         * Remove the style attribute from e and all of its descendants.
         * TODO: Test if getElementsByTagName(*) is faster.
         *
         * @param e the root element to clean
         **/
747        protected void cleanStyles(Element e) {
748                if(e == null) return; 
749                Node cur = e.getFirstChild();
750
                // Remove any root styles, if we're able.
752                if (!e.getAttribute("class").equals("readability-styled"))
753                        e.removeAttribute("style");
754
755                // Go until there are no more child nodes
756                while ( cur != null ) {
757                        if ( cur.getNodeType() == Element.ELEMENT_NODE) {
758                                // Remove style attribute(s) :
759                                if(!((Element) cur).getAttribute("class").equals("readability-styled")) {
760                                        ((Element) cur).removeAttribute("style");
761                                }
762                                cleanStyles( (Element) cur );
763                        }
764                        cur = cur.getNextSibling();
765                }  
766        }
767
        /**
         * Remove extraneous break tags from a node.
         *
         * @param e the element to clean
         **/
773        protected void killBreaks(Element e) {
774                //e.innerHTML = e.innerHTML.replace(readability.regexps.killBreaksRe,"<br />");       
775
776                Node n = stringToNode(getInnerHTML(e).replaceAll(Regexps.killBreaksRe,"<BR />"));
777                removeChildren(e);
778                e.appendChild(n);
779        }
780
        /**
         * Clean a node of all elements of type "tag".
         * (Unless it's a youtube/vimeo video. People love movies.)
         *
         * @param e the element to clean
         * @param tag the tag name to clean out
         **/
788        protected void clean(Element e, String tag) {
789                NodeList targetList = e.getElementsByTagName( tag );
790                boolean isEmbed    = (tag.equals("object") || tag.equals("embed"));
791
792                for (int y=targetList.getLength()-1; y >= 0; y--) {
793                        /* Allow youtube and vimeo videos through as people usually want to see those. */
794                        if(isEmbed) {
795                                String attributeValues = "";
796                                for (int i=0, il=targetList.item(y).getAttributes().getLength(); i < il; i++) {
797                                        attributeValues += targetList.item(y).getAttributes().item(i).getNodeValue() + "|";
798                                }
799
800                                /* First, check the elements attributes to see if any of them contain youtube or vimeo */
801                                if (search(attributeValues, Regexps.videoRe) != -1) {
802                                        continue;
803                                }
804
805                                /* Then check the elements inside this element for the same. */
806                                if (search(getInnerHTML(targetList.item(y)), Regexps.videoRe) != -1) {
807                                        continue;
808                                }
809                        }
810
811                        targetList.item(y).getParentNode().removeChild(targetList.item(y));
812                }
813        }
814
        /**
         * Clean out spurious headers from an Element. Checks things like classnames and link density.
         *
         * @param e the element to clean headers from
         **/
820        protected void cleanHeaders(Element e) {
821                for (int headerIndex = 1; headerIndex < 7; headerIndex++) {
822                        NodeList headers = e.getElementsByTagName("h" + headerIndex);
823                        for (int i=headers.getLength()-1; i >=0; i--) {
824                                if (getClassWeight((Element) headers.item(i)) < 0 || getLinkDensity((Element) headers.item(i)) > LINK_DENSITY_THRESHOLD) {
825                                        headers.item(i).getParentNode().removeChild(headers.item(i));
826                                }
827                        }
828                }
829        }
830
        /**
         * Get the density of links as a fraction of the content.
         * This is the amount of text that is inside a link divided by the total text in the node.
         *
         * @param e the element to measure
         * @return the link density (float between 0 and 1)
         **/
838        protected float getLinkDensity(Element e) {
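                //e.g. 50 characters of anchor text within 200 characters of total text gives a density of 0.25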
839                NodeList links = e.getElementsByTagName("a");
840                int textLength = getInnerText(e).length();
841                int linkLength = 0;
842
843                for(int i=0, il=links.getLength(); i<il;i++)
844                {
845                        linkLength += getInnerText((Element) links.item(i)).length();
846                }
847
848                if (linkLength == 0) return 0;
849
850                return (float)linkLength / (float)textLength;
851        }
852
853        /**
854         * Clean an element of all tags of type "tag" if they look fishy.
855         * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
856         **/
857        protected void cleanConditionally(Element e, String tag) {
858                NodeList tagsList = e.getElementsByTagName(tag);
859                int curTagsLength = tagsList.getLength();
860
861                /**
862                 * Gather counts for other typical elements embedded within.
                 * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
864                 *
865                 * Todo: Consider taking into account original contentScore here.
866                 **/
867                for (int i=curTagsLength-1; i >= 0; i--) {
868                        int weight = getClassWeight((Element) tagsList.item(i));
869                        float contentScore = (tagsList.item(i).getUserData("readability") != null) ? (Float)(tagsList.item(i).getUserData("readability")) : 0;
870
871                        dbg("Cleaning Conditionally " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class")+ ":" + ((Element)tagsList.item(i)).getAttribute("id") + ")" + ((tagsList.item(i).getUserData("readability") != null) ? (" with score " + tagsList.item(i).getUserData("readability")) : ""));
872
873                        if(weight+contentScore < 0)
874                        {
875                                dbg("Removing " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class")+ ":" + ((Element)tagsList.item(i)).getAttribute("id") + ")");
876                                tagsList.item(i).getParentNode().removeChild(tagsList.item(i));
877                        }
878                        else if ( getCharCount((Element) tagsList.item(i), ",") < 10) {
879                                /**
880                                 * If there are not very many commas, and the number of
881                                 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
882                                 **/
883                                int p      = ((Element) tagsList.item(i)).getElementsByTagName("p").getLength();
884                                int img    = ((Element) tagsList.item(i)).getElementsByTagName("img").getLength();
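                                //the -100 offset (carried over from readability.js) means list items only
                                //count against the element when they vastly outnumber paragraphs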
885                                int li     = ((Element) tagsList.item(i)).getElementsByTagName("li").getLength()-100;
886                                int input  = ((Element) tagsList.item(i)).getElementsByTagName("input").getLength();
887
888                                int embedCount = 0;
889                                NodeList embeds = ((Element) tagsList.item(i)).getElementsByTagName("embed");
890                                for(int ei=0,il=embeds.getLength(); ei < il; ei++) {
891                                        if (search(((Element)embeds.item(ei)).getAttribute("src"), Regexps.videoRe) == -1) {
892                                                embedCount++; 
893                                        }
894                                }
895
896                                float linkDensity = getLinkDensity((Element) tagsList.item(i));
897                                int contentLength = getInnerText((Element) tagsList.item(i)).length();
898                                boolean toRemove = false;
899
900                                if ( img > p ) {
901                                        toRemove = true;
                                } else if(li > p && !tag.equals("ul") && !tag.equals("ol")) {
903                                        toRemove = true;
904                                } else if( input > Math.floor(p/3) ) {
905                                        toRemove = true; 
906                                } else if(contentLength < 25 && (img == 0 || img > 2) ) {
907                                        toRemove = true;
908                                } else if(weight < 25 && linkDensity > 0.2) {
909                                        toRemove = true;
910                                } else if(weight >= 25 && linkDensity > 0.5) {
911                                        toRemove = true;
912                                } else if((embedCount == 1 && contentLength < 75) || embedCount > 1) {
913                                        toRemove = true;
914                                }
915
916                                if ( img == 1 &&  p == 0 && contentLength == 0 ) {
917                                        Element theImg = (Element) ((Element) tagsList.item(i)).getElementsByTagName("img").item(0);  
918
                                        //getAttribute never returns null; it returns "" when the attribute is absent
                                        String w = theImg.getAttribute("width");
                                        String h = theImg.getAttribute("height");
924
925                                        if (!(w.equals("0") || h.equals("0")))
926                                                toRemove = false; //special case - it's just an inline image
927                                }
928
929                                if(toRemove) {
930                                        dbg("Removing " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class")+ ":" + ((Element)tagsList.item(i)).getAttribute("id") + ")");
931                                        tagsList.item(i).getParentNode().removeChild(tagsList.item(i));
932                                }
933                        }
934                }
935        }
936
        /**
         * Get the number of times a string s appears in the node e.
         *
         * @param e the element to search
         * @param s what to split on (the single-argument overload defaults to ",")
         * @return the number of occurrences (integer)
         **/
944        protected int getCharCount(Element e, String s) {
945                return getInnerText(e).split(s).length-1;
946        }
947
948        protected int getCharCount(Element e) {
949                return getCharCount(e, ",");
950        }
951
952        /**
953         * @return The article title 
954         */
955        public String getArticleTitle() {
956                return articleTitle;
957        }
958
959        /**
960         * @return The content type of the article
961         */
962        public String getArticleContentType() {
963                return article_contentType;
964        }
965
        /***
         * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
         *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
         *
         * @return the extracted article content, wrapped in a div
         **/
972        protected Element grabArticle() {
973                boolean stripUnlikelyCandidates = flags.contains(Flag.FLAG_STRIP_UNLIKELYS);
974
975                /**
976                 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
977                 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
978                 *
979                 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
980                 * Todo: Shouldn't this be a reverse traversal?
981                 **/
982                Element node = null;
983                List<Element> nodesToScore = new ArrayList<Element>();
984                for(int nodeIndex = 0; (node = (Element)document.getElementsByTagName("*").item(nodeIndex)) != null; nodeIndex++)
985                {
986                        /* Remove unlikely candidates */
987                        if (stripUnlikelyCandidates) {
988                                String unlikelyMatchString = node.getAttribute("class") + node.getAttribute("id");
989                                if (search(unlikelyMatchString, Regexps.unlikelyCandidatesRe) != -1 &&
990                                                search(unlikelyMatchString, Regexps.okMaybeItsACandidateRe) == -1 &&
991                                                !node.getTagName().equals("BODY"))
992                                {
993                                        dbg("Removing unlikely candidate - " + unlikelyMatchString);
994                                        node.getParentNode().removeChild(node);
995                                        nodeIndex--;
996                                        continue;
997                                }               
998                        }
999
1000                        if (node.getTagName().equals("P") || node.getTagName().equals("TD")) {
1001                                nodesToScore.add(node);
1002                        }
1003
1004                        /* Turn all divs that don't have children block level elements into p's */
1005                        if (node.getTagName().equals("DIV")) {
1006
1007                                if (search(getInnerHTML(node), Regexps.divToPElementsRe) == -1) {
1008                                        dbg("Altering div to p");
1009                                        Element newNode = document.createElement("P");
1010
1011                                        //newNode.innerHTML = node.innerHTML;
                                        //the child NodeList is live, so appending a node removes it from the list
                                        NodeList nl = node.getChildNodes();
                                        while (nl.getLength() > 0) newNode.appendChild(nl.item(0));
1014
1015                                        node.getParentNode().replaceChild(newNode, node);
1016                                        nodeIndex--;
1017                                }
1018                                else
1019                                {
1020                                        /* EXPERIMENTAL */
1021                                        for(int i = 0, il = node.getChildNodes().getLength(); i < il; i++) {
1022                                                Node childNode = node.getChildNodes().item(i);
1023                                                if(childNode.getNodeType() == Element.TEXT_NODE) {
1024                                                        dbg("replacing text node with a p tag with the same content.");
1025                                                        Element p = document.createElement("p");
1026                                                        //p.innerHTML = childNode.nodeValue;
1027                                                        p.setNodeValue(childNode.getNodeValue());
1028                                                        p.setTextContent(childNode.getTextContent());
1029                                                        //p.style.display = "inline";
1030                                                        p.setAttribute("class", "readability-styled");
1031                                                        childNode.getParentNode().replaceChild(p, childNode);
1032                                                }
1033                                        }
1034                                }
1035                        } 
1036                }
1037
1038                /**
1039                 * Loop through all paragraphs, and assign a score to them based on how content-y they look.
1040                 * Then add their score to their parent node.
1041                 *
1042                 * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
1043                 **/
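                //For example, a 260-character paragraph containing three commas adds
                //1 (base) + 4 (comma splits) + 2 (one point per 100 chars, capped at 3)
                //= 7 to its parent's score and 3.5 to its grandparent's.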
1044                List<Element> candidates = new ArrayList<Element>();
1045                for (int pt=0; pt < nodesToScore.size(); pt++) {
1046                        Element parentNode      = (Element) nodesToScore.get(pt).getParentNode();
1047                        Element grandParentNode = (Element) parentNode.getParentNode();
1048                        String innerText        = getInnerText(nodesToScore.get(pt));
1049
1050                        /* If this paragraph is less than 25 characters, don't even count it. */
1051                        if(innerText.length() < 25) {
1052                                continue; 
1053                        }
1054
1055                        /* Initialize readability data for the parent. */
1056                        if(parentNode.getUserData("readability") == null)
1057                        {
1058                                initializeNode(parentNode);
1059                                candidates.add(parentNode);
1060                        }
1061
1062                        /* Initialize readability data for the grandparent. */
1063                        if(grandParentNode.getUserData("readability") == null)
1064                        {
1065                                initializeNode(grandParentNode);
1066                                candidates.add(grandParentNode);
1067                        }
1068
1069                        float contentScore = 0;
1070
1071                        /* Add a point for the paragraph itself as a base. */
1072                        contentScore++;
1073
1074                        /* Add points for any commas within this paragraph */
1075                        contentScore += innerText.split(",").length;
1076
1077                        /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
1078                        contentScore += Math.min(Math.floor((float)innerText.length() / 100F), 3F);
1079
1080                        /* Add the score to the parent. The grandparent gets half. */
1081                        parentNode.setUserData("readability", ((Float)(parentNode.getUserData("readability")) + contentScore), null);
1082                        grandParentNode.setUserData("readability", ((Float)(grandParentNode.getUserData("readability"))) + (contentScore/2F), null);
1083                }
1084
1085                /**
1086                 * After we've calculated scores, loop through all of the possible candidate nodes we found
1087                 * and find the one with the highest score.
1088                 **/
1089                Element topCandidate = null;
1090                for(int c=0, cl=candidates.size(); c < cl; c++)
1091                {
1092                        /**
1093                         * Scale the final candidates score based on link density. Good content should have a
1094                         * relatively small link density (5% or less) and be mostly unaffected by this operation.
1095                         **/
1096
1097                        candidates.get(c).setUserData("readability", (Float)(candidates.get(c).getUserData("readability")) * (1F-getLinkDensity(candidates.get(c))), null);

            dbg("Candidate: " + candidates.get(c) + " (" + candidates.get(c).getAttribute("class")+ ":" + candidates.get(c).getAttribute("id")+ ") with score " + candidates.get(c).getUserData("readability"));

            if(topCandidate == null || (Float)(candidates.get(c).getUserData("readability")) > ((Float)topCandidate.getUserData("readability"))) {
                topCandidate = candidates.get(c);
            }
        }

        if (topCandidate != null)
            dbg("==> TOP Candidate: " + topCandidate + " (" + topCandidate.getAttribute("class")+ ":" + topCandidate.getAttribute("id")+ ") with score " + topCandidate.getUserData("readability"));

        /**
         * If we still have no top candidate, just use the body as a last resort.
         * We also have to copy the body node so it is something we can modify.
         **/
        if (topCandidate == null || topCandidate.getTagName().equals("BODY"))
        {
            topCandidate = document.createElement("DIV");

            //topCandidate.innerHTML = document.body.innerHTML;
            //document.body.innerHTML = ""; //should be covered by above
            // Move every child of the body into the new DIV. appendChild removes each
            // node from the live child list, so repeatedly take the first child rather
            // than iterating by index (which would skip every other node).
            while (getBody().getFirstChild() != null)
                topCandidate.appendChild(getBody().getFirstChild());

            getBody().appendChild(topCandidate);
            initializeNode(topCandidate);
        }

        /**
         * Now that we have the top candidate, look through its siblings for content that might also be related.
         * Things like preambles, content split by ads that we removed, etc.
         **/
        Element articleContent = document.createElement("DIV");
        articleContent.setAttribute("id", "readability-content");
        float siblingScoreThreshold = (float) Math.max(10F, (Float)topCandidate.getUserData("readability") * 0.2F);
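        /*
         * Illustrative example: a top candidate scoring 80 gives a sibling threshold of
         * max(10, 16) = 16, while one scoring 30 stays at the floor of 10.
         */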
        NodeList siblingNodes = topCandidate.getParentNode().getChildNodes();

        for(int s=0, sl=siblingNodes.getLength(); s < sl; s++)
        {
            Node siblingNode = siblingNodes.item(s);
            boolean append = false;

            if (siblingNode instanceof Element)
                dbg("Looking at sibling node: " + siblingNode + " (" + ((Element) siblingNode).getAttribute("class") + ":" + ((Element) siblingNode).getAttribute("id") + ")" + ((siblingNode.getUserData("readability") != null) ? (" with score " + siblingNode.getUserData("readability")) : ""));
            dbg("Sibling has score " + (siblingNode.getUserData("readability") != null ? siblingNode.getUserData("readability") : "Unknown"));

            if(siblingNode == topCandidate)
            {
                append = true;
            }

            float contentBonus = 0;
            /* Give a bonus if the sibling node and the top candidate share the same class name */
            if(siblingNode instanceof Element && ((Element) siblingNode).getAttribute("class").equals(topCandidate.getAttribute("class")) && !topCandidate.getAttribute("class").equals("")) {
                contentBonus += (Float)topCandidate.getUserData("readability") * 0.2F;
            }

            if(siblingNode.getUserData("readability") != null && ((Float)siblingNode.getUserData("readability")+contentBonus) >= siblingScoreThreshold)
            {
                append = true;
            }

            if(siblingNode.getNodeName().equals("P")) {
                float linkDensity = getLinkDensity((Element) siblingNode);
                String nodeContent = getInnerText((Element) siblingNode);
                int nodeLength = nodeContent.length();

                if(nodeLength > 80 && linkDensity < 0.25)
                {
                    append = true;
                }
                else if(nodeLength < 80 && linkDensity == 0 && search(nodeContent, "\\.( |$)") != -1)
                {
                    append = true;
                }
            }

            if(append)
            {
                dbg("Appending node: " + siblingNode);

                Node nodeToAppend = null;
                if(!siblingNode.getNodeName().equals("DIV") && !siblingNode.getNodeName().equals("P")) {
                    /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */

                    dbg("Altering siblingNode of " + siblingNode.getNodeName() + " to div.");
                    nodeToAppend = document.createElement("div");
                    if (siblingNode instanceof Element)
                        ((Element) nodeToAppend).setAttribute("id", ((Element) siblingNode).getAttribute("id"));

                    //nodeToAppend.innerHTML = siblingNode.innerHTML;
                    // Move the sibling's children; take the first child each time because
                    // appendChild removes nodes from the live child list as it goes.
                    while (siblingNode.getFirstChild() != null)
                        nodeToAppend.appendChild(siblingNode.getFirstChild());
                } else {
                    nodeToAppend = siblingNode;
                    s--;
                    sl--;
                }

                /* To ensure a node does not interfere with readability styles, remove its classnames */
                if (nodeToAppend instanceof Element)
                    ((Element) nodeToAppend).setAttribute("class", "");

                /* Append the sibling; appendChild moves the node out of its original parent, which is why the loop indices were adjusted above */
                articleContent.appendChild(nodeToAppend);
            }
        }

        /**
         * So we have all of the content that we need. Now we clean it up for presentation.
         **/
        prepArticle(articleContent);

        return articleContent;
    }

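    /**
     * Serialise the contents of a node to an HTML string (the equivalent of the
     * JavaScript innerHTML property): text and comment children are emitted
     * directly, and element children are serialised via {@link #nodeToString(Node)}.
     *
     * @param n the node whose children should be serialised
     * @return the inner HTML of the node
     */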
    protected String getInnerHTML(Node n) {
        if (n.getNodeType() == Node.TEXT_NODE) return n.getTextContent();

        String result = "";
        NodeList nl = n.getChildNodes();
        for (int i=0; i<nl.getLength(); i++) {
            if (nl.item(i).getNodeType() == Node.TEXT_NODE)
                result += nl.item(i).getTextContent();
            else if (nl.item(i).getNodeType() == Node.COMMENT_NODE)
                result += "<!-- " + nl.item(i).getTextContent() + " -->";
            else
                result += nodeToString(nl.item(i));
        }

        return result;
    }

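    /**
     * Serialise a DOM node to a string without pretty-printing.
     *
     * @param n the node to serialise
     * @return the serialised markup
     */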
    protected String nodeToString(Node n) {
        return nodeToString(n, false);
    }

    protected String nodeToString(Node n, boolean pretty) {
        try {
            DOMImplementationRegistry registry = DOMImplementationRegistry.newInstance();
            DOMImplementationLS impl = (DOMImplementationLS)registry.getDOMImplementation("LS");
            LSSerializer writer = impl.createLSSerializer();

            writer.getDomConfig().setParameter("xml-declaration", false);
            if (pretty) {
                writer.getDomConfig().setParameter("format-pretty-print", true);
            }

            return writer.writeToString(n);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

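    /**
     * Parse an HTML fragment into a {@link DocumentFragment} owned by the current
     * document. Any parse or configuration failure is rethrown as a
     * {@link RuntimeException}.
     *
     * @param str the HTML fragment to parse
     * @return the parsed fragment
     */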
    protected Node stringToNode(String str) {
        try {
            DOMFragmentParser parser = new DOMFragmentParser();
            DocumentFragment fragment = document.createDocumentFragment();
            parser.parse(new InputSource(new StringReader(str)), fragment);
            return fragment;

            //try and return the element itself if possible...
            //NodeList nl = fragment.getChildNodes();
            //for (int i=0; i<nl.getLength(); i++) if (nl.item(i).getNodeType() == Node.ELEMENT_NODE) return nl.item(i);
            //return fragment;

        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Get the inner text of an element, with leading and trailing whitespace trimmed.
     *
     * @param e the element whose text should be extracted
     * @param normalizeSpaces if true, collapse runs of whitespace into a single space
     * @return the inner text of the element
     **/
    protected String getInnerText(Element e, boolean normalizeSpaces) {
        String textContent = "";

        textContent = e.getTextContent().replaceAll( Regexps.trimRe, "" );

        if(normalizeSpaces) {
            return textContent.replaceAll( Regexps.normalizeRe, " ");
        } else {
            return textContent;
        }
    }

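    /**
     * Recursively gather the text content of a node and its descendants, skipping
     * script elements and appending a space after each leaf node so that adjacent
     * blocks do not run together.
     *
     * @param e the node to walk
     * @return the concatenated text
     */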
    protected String getInnerTextSep(Node e) {
        if (e.hasChildNodes()) {
            String s = "";
            NodeList nl = e.getChildNodes();
            for (int i=0; i<nl.getLength(); i++) {
                if (!nl.item(i).getNodeName().equalsIgnoreCase("script"))
                    s += getInnerTextSep(nl.item(i));
            }
            return s;
        } else {
            return e.getTextContent() + " ";
        }
    }

    protected String getInnerText(Element e) {
        return getInnerText(e, true);
    }

    /**
     * @return The article HTML content as a {@link String}.
     */
    public String getArticleHTML() {
        if (articleContent == null) return "";
        return nodeToString(articleContent, true);
    }

    /**
     * @return The article's HTML DOM node.
     */
    public Node getArticleHTML_DOM() {
        return articleContent;
    }

    protected String getArticleDateString() {
        return article_date_string;
    }

    /**
     * @return The article date.
     */
    public Date getArticleDate() {
        return article_date;
    }

    /**
     * @return The text of the article.
     */
    public String getArticleText() {
        if (articleContent == null) return "Unable to find article content";
        //return getInnerText(articleContent, false);
        // Collapse runs of line breaks into a single paragraph break and squeeze repeated spaces.
        return articleContent.getTextContent().trim().replaceAll("(\r\n|\r|\n){2,}", "\n\n").replaceAll(" {2,}", " ");
    }

    /**
     * @return Any links in the article.
     */
    public List<Anchor> getArticleLinks() {
        List<Anchor> anchors = new ArrayList<Anchor>();
        if (articleContent == null) return anchors;

        NodeList nl = articleContent.getElementsByTagName("a");
        for (int i=0; i<nl.getLength(); i++) {
            Element a = (Element) nl.item(i);

            Anchor anchor = new Anchor(getInnerText(a), a.getAttribute("href"));
            anchors.add(anchor);
        }
        return anchors;
    }

    /**
     * @return Any links in the document.
     */
    public List<Anchor> getAllLinks() {
        List<Anchor> anchors = new ArrayList<Anchor>();

        NodeList nl = document.getElementsByTagName("a");
        for (int i=0; i<nl.getLength(); i++) {
            Element a = (Element) nl.item(i);
            Anchor anchor = new Anchor(getInnerText(a), a.getAttribute("href"));
            anchors.add(anchor);
        }
        return anchors;
    }

    /**
     * @return Any images in the article.
     */
    public List<String> getArticleImages() {
        List<String> images = new ArrayList<String>();
        if (articleContent == null) return images;

        NodeList nl = articleContent.getElementsByTagName("img");
        for (int i=0; i<nl.getLength(); i++) {
            Element img = (Element) nl.item(i);
            images.add(img.getAttribute("src"));
        }
        return images;
    }

    /**
     * @return Any subheadings in the article.
     */
    public List<String> getArticleSubheadings() {
        List<String> subtitles = new ArrayList<String>();
        if (articleContent == null) return subtitles;

        for (int j=1; j<=6; j++) {
            NodeList nl = articleContent.getElementsByTagName("h"+j);
            if (nl.getLength() > 0) {
                for (int i=0; i<nl.getLength(); i++) {
                    subtitles.add(nl.item(i).getTextContent());
                }
                break;
            }
        }

        if (subtitles.size() == 0) {
            //try looking for other likely-looking elements
            NodeList nl = articleContent.getElementsByTagName("*");
            for (int i=0; i<nl.getLength(); i++) {
                if (nl.item(i) instanceof Element &&
                        ((Element) nl.item(i)).getAttribute("class") != null &&
                        search(((Element) nl.item(i)).getAttribute("class"), Regexps.likelySubheadCandidateRe) != -1)
                    subtitles.add(nl.item(i).getTextContent());
            }
        }

        return subtitles;
    }

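    /**
     * Find the direct children of a node that have the given node name.
     *
     * @param parent the node whose children should be searched
     * @param name the node name to match
     * @return the matching children, in document order
     */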
    protected List<Node> findChildNodesWithName(Node parent, String name) {
        NodeList children = parent.getChildNodes();
        List<Node> results = new ArrayList<Node>();

        for (int i = 0; i < children.getLength(); ++i) {
            Node child = children.item(i);
            if (child == null)
                continue;

            String nodeName = child.getNodeName();
            if (nodeName == null)
                continue;

            if (nodeName.equals(name)) {
                results.add(child);
            }
        }
        return results;
    }

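    /**
     * Find the index of a node within its parent's child list.
     *
     * @param parent the parent node
     * @param childToFind the child to locate
     * @return the child's index, or -1 if it is not a child of the parent
     */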
    protected int findChildNodeIndex( Node parent, Node childToFind )
    {
        for( int index = 0; index < parent.getChildNodes().getLength(); index++ )
            if( parent.getChildNodes().item( index ) == childToFind )
                return index;
        return -1;
    }

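    /**
     * Walk the subtree below the walker's current node and record a
     * {@link MappingNode} for every non-empty text node whose parent element
     * carries an id attribute. The identifier takes the form
     * <code>parentId[childIndex]</code>.
     *
     * @param walker the tree walker positioned at the subtree root
     * @param map the list to which mappings are appended
     * @throws DOMException if the DOM cannot be traversed
     */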
    protected void getArticleTextMapping(TreeWalker walker, List<MappingNode> map) throws DOMException {
        Node parend = walker.getCurrentNode();

        if( parend.getNodeType() == Node.TEXT_NODE && parend.getParentNode().getAttributes().getNamedItem("id") != null )
        {
            if( parend.getTextContent().trim().length() > 0 )
            {
                int index = findChildNodeIndex( parend.getParentNode(), parend );
                if( index != -1 )
                {
                    // square brackets are not valid XML/HTML identifier characters, so we can use them here
                    map.add( new MappingNode(
                            parend.getParentNode().getAttributes().getNamedItem("id").getNodeValue() + "["+index+"]",
                            parend.getNodeValue() ) );

//                    System.out.println( "ELEMENT '"+parend.getParentNode().getAttributes().getNamedItem("id").getNodeValue() + "["+index+"]"+"'");
//                    System.out.println( "VALUE:  '"+parend.getNodeValue()+"'" );
                }
            }
        }

        // traverse children:
        for (Node n = walker.firstChild(); n != null; n = walker.nextSibling()) {
            getArticleTextMapping(walker, map);
        }

        // return the walker to the current node (one level up):
        walker.setCurrentNode(parend);
    }

    protected class MappingNode {
        String id;
        String text;

        public MappingNode(String id, String text) { this.id = id; this.text = text; }
        public String getId() { return id; }
        public String getText() { return text; }
        @Override public String toString() { return "MappingNode(" + id + " -> " + text + ")"; }
    }

    /**
     * Get the mapping between the text nodes of the extracted article and
     * identifiers built from the id attribute of the parent element and the
     * text node's index within it.
     *
     * @return the list of identifier-to-text mappings, or null if no article content has been extracted
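     * <p>
     * For example (illustrative only), a text node that is the third child of an
     * element with id "story" is reported with the identifier <code>story[2]</code>
     * together with that node's text.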
     */
    public List<MappingNode> getArticleTextMapping() {
        if (articleContent == null) return null;

        List<MappingNode> map = new ArrayList<MappingNode>();

        TreeWalker walker = ((DocumentTraversal) document).createTreeWalker(articleContent, NodeFilter.SHOW_TEXT | NodeFilter.SHOW_ELEMENT, null, true);

        getArticleTextMapping(walker, map);

        return map;
    }

    /**
     * Convenience method to build a {@link Readability} instance from an HTML string.
     * @param html The HTML string
     * @return new {@link Readability} instance.
     * @throws SAXException if the HTML cannot be parsed
     * @throws IOException if an I/O error occurs while reading the HTML
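     *
     * <p>
     * A minimal usage sketch (the HTML snippet below is only a placeholder):
     * <pre>{@code
     * String html = "<html><body><div id=\"content\"><p>Some article text.</p></div></body></html>";
     * Readability r = Readability.getReadability(html);
     * System.out.println(r.getArticleText());
     * for (Anchor a : r.getArticleLinks())
     *     System.out.println(a.getHref() + "\t" + a.getText());
     * }</pre>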
     */
    public static Readability getReadability(String html) throws SAXException, IOException {
        return getReadability( html, false );
    }

    /**
     * Convenience method to build a {@link Readability} instance from an HTML string.
     * @param html The HTML string
     * @param addTitle Should the title be added to the generated article?
     * @return new {@link Readability} instance.
     * @throws SAXException if the HTML cannot be parsed
     * @throws IOException if an I/O error occurs while reading the HTML
     */
    public static Readability getReadability(String html, boolean addTitle) throws SAXException, IOException {
        DOMParser parser = new DOMParser();
        parser.parse(new InputSource(new StringReader(html)));

        return new Readability(parser.getDocument(), false, addTitle );
    }

    /**
     * Command-line entry point for ad-hoc testing: fetches a page, extracts the
     * article text and prints the text-node mapping.
     * @param argv ignored
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static void main(String[] argv) throws Exception {
        URL input = new URL("http://news.bbc.co.uk/1/hi/politics/10362367.stm");

        // Alternative test inputs:
        //URL input = new URL("file:///home/dd/Programming/Readability4J/t.html");
        //URL input = new URL("http://euobserver.com/9/30465");
        //URL input = new URL("http://euobserver.com/?aid=23383");
        //URL input = new URL("http://abandoninplace.squarespace.com/blog/2010/6/8/wwdc-monday.html");
        //URL input = new URL("file:///Users/jsh2/Desktop/test.html");
        //URL input = new URL("http://mobile.engadget.com/2010/06/17/htc-aria-review/");
        //URL input = new URL("http://thedailywtf.com/Articles/Benched.aspx");
        //URL input = new URL("http://www.dailymail.co.uk/news/article-1287625/Woman-sparked-150-000-manhunt-slashing-face-crying-rape-faces-jail.html");
        //URL input = new URL("http://mrpaparazzi.com/post/11619/Lindsay-Lohan-Tests-Negative-For-Alcohol-Goes-Clubbing-To-Celebrate.aspx");
        //URL input = new URL("http://www.bbc.co.uk/news/world-middle-east-11415719");
        //URL input = new URL("http://www.thebigproject.co.uk/news/");
        //URL input = new URL("http://blogs.euobserver.com/popescu/2009/12/15/on-euro-optimism-pessimism-and-failures/#more-958");
        //URL input = new URL("http://www.cnn.com/2010/WORLD/meast/09/27/west.bank.settlement.construction/index.html?hpt=T2");
        //URL input = new URL("http://www.huffingtonpost.com/steven-cohen/its-time-to-enact-congest_b_740315.html");
        //URL input = new URL("http://uk.mac.ign.com/articles/573/573319p1.html");

        DOMParser parser = new DOMParser();
        parser.parse(new InputSource(input.openStream()));

        Readability r = new Readability(parser.getDocument(), false, true);

        //System.out.println(r.getArticleTitle());
        //System.out.println(r.getArticleHTML());
        //System.out.println(r.getAllLinks());
        System.out.println(r.getArticleText());

        System.out.println();
        System.out.println("***");
        System.out.println();

        for (MappingNode s : r.getArticleTextMapping())
            System.out.println(s);

        //PrintStream out = new PrintStream("news-sites");
        //for (Anchor anchor : r.getAllLinks()) {
        //    out.println(anchor.getHref() + "\t" + anchor.getText());
        //}
        //out.close();

        //System.out.println(r.getArticleImages());
        //System.out.println(r.getArticleSubheadings());
        //System.out.println(r.getArticleHTML());
        //System.out.println(r.getArticleHTML_DOM());

        //System.out.println(r.getArticleDateString());
        //System.out.println(r.getArticleDate());
    }
}