001/** 002 * Copyright 2010 The University of Southampton, Yahoo Inc., and the 003 * individual contributors. All rights reserved. 004 * 005 * Licensed under the Apache License, Version 2.0 (the "License"); 006 * you may not use this file except in compliance with the License. 007 * You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.openimaj.web.readability; 018 019import java.io.IOException; 020import java.io.StringReader; 021import java.net.URL; 022import java.text.ParseException; 023import java.text.SimpleDateFormat; 024import java.util.ArrayList; 025import java.util.Date; 026import java.util.EnumSet; 027import java.util.List; 028import java.util.regex.Matcher; 029import java.util.regex.Pattern; 030 031import org.cyberneko.html.parsers.DOMFragmentParser; 032import org.cyberneko.html.parsers.DOMParser; 033import org.pojava.datetime.DateTime; 034import org.w3c.dom.DOMException; 035import org.w3c.dom.Document; 036import org.w3c.dom.DocumentFragment; 037import org.w3c.dom.Element; 038import org.w3c.dom.Node; 039import org.w3c.dom.NodeList; 040import org.w3c.dom.bootstrap.DOMImplementationRegistry; 041import org.w3c.dom.ls.DOMImplementationLS; 042import org.w3c.dom.ls.LSSerializer; 043import org.w3c.dom.traversal.DocumentTraversal; 044import org.w3c.dom.traversal.NodeFilter; 045import org.w3c.dom.traversal.TreeWalker; 046import org.xml.sax.InputSource; 047import org.xml.sax.SAXException; 048 049/** 050 * Class for extracting the "content" from web-pages, and ignoring adverts, etc. 
 * Based upon readability.js (http://lab.arc90.com/experiments/readability/) and
 * modified to behave better for certain sites (and typically better mimic Safari
 * Reader functionality).
 *
 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
 * @author Michael Matthews (mikemat@yahoo-inc.com)
 * @author David Dupplaw (dpd@ecs.soton.ac.uk)
 */
public class Readability
{
	/**
	 * Regular expressions for different types of content.
	 * These are pattern strings (compiled on demand by match()/search());
	 * the (?i) prefix makes a pattern case-insensitive.
	 */
	protected static class Regexps {

		// class/id fragments suggesting a node is boilerplate (comments, menus, ads, ...)
		public static String unlikelyCandidatesRe = "(?i)combx|comment|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor|story-feature|banner"; //caption?
		// fragments that rescue a node flagged by unlikelyCandidatesRe
		public static String okMaybeItsACandidateRe = "(?i)and|comments|article|body|column|main";
		// class/id fragments that raise an element's content score
		public static String positiveRe = "(?i)article|body|comments|content|entry|hentry|page|pagination|post|text";
		// class/id fragments that lower an element's content score
		public static String negativeRe= "(?i)combx|comment|contact|foot|footer|footnote|link|masthead|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget|warning";
		// block-level tags; a div containing none of these gets converted to a <p>
		public static String divToPElementsRe = "(?i)(a|blockquote|dl|div|img|ol|p|pre|table|ul)";
		// two-or-more consecutive <br>s (with interleaved whitespace) = paragraph break
		public static String replaceBrsRe = "(?i)(<br[^>]*>[ \n\r\t]*){2,}";
		// opening/closing <font> tags (replaced with spans)
		public static String replaceFontsRe ="(?i)<(\\/?)font[^>]*>";
		// leading/trailing whitespace (JS-style trim)
		public static String trimRe = "^\\s+|\\s+$";
		// runs of whitespace
		public static String normalizeRe = "\\s{2,}";
		// runs of <br> tags, collapsed to a single break
		// NOTE(review): the second alternative may originally have been "&nbsp;" and
		// been mangled in transit — verify against upstream readability.js
		public static String killBreaksRe = "(<br\\s*\\/?>(\\s| ?)*){1,}";
		// youtube/vimeo URLs — embeds matching this are preserved by clean()
		public static String videoRe = "(?i)http:\\/\\/(www\\.)?(youtube|vimeo)\\.com";

		// characters treated as <title> separators (used inside a character class)
		public static String titleSeparatorRe = "\\|\\-\\/";

		//this is used to try and find elements that represent sub-headings (that are not h1..h6)
		public static String likelySubheadCandidateRe = "(?i)cross-head";
	}

	// Processing flags; init() drops them one at a time when a pass yields too little content
	enum Flag {
		FLAG_STRIP_UNLIKELYS,
		FLAG_WEIGHT_CLASSES
	}

	/**
	 * Threshold for removing elements with lots of links
	 */
	public static float LINK_DENSITY_THRESHOLD = 0.33F;


	//IVARS
	protected Document document;            // the DOM being processed (mutated in place)
	private Node bodyCache;                 // pristine clone of <body> used to restore before a re-run
	protected EnumSet<Flag> flags = EnumSet.allOf(Flag.class);

	protected String articleTitle;          // extracted title
	protected Element articleContent;       // extracted main content (null if nothing usable found)
	protected String article_date_string;   // raw date text as found in the page
	protected Date article_date;            // parsed form of article_date_string (may be null)
	protected String article_contentType;   // value of the Content-Type <meta>, if present

	protected boolean debug = false;        // when true, dbg() writes to stderr

	protected boolean addTitle = false;     // when true, an <h1> title is prepended to the output

	/**
	 * Construct with the given document. Debugging is disabled.
	 * @param document The document.
	 */
	public Readability(Document document) {
		this(document, false);
	}

	/**
	 * Construct with the given document. The second argument can be used to enable
	 * debugging output.
	 * @param document The document.
	 * @param debug Enable debugging output.
	 */
	public Readability(Document document, boolean debug) {
		this(document, debug, false);
	}

	/**
	 * Construct with the given document. The second argument can be used to enable
	 * debugging output. The third option controls whether the title should be
	 * included in the output.
	 * @param document The document.
	 * @param debug Enable debugging output.
	 * @param addTitle Add title to output.
	 */
	public Readability(Document document, boolean debug, boolean addTitle) {
		this.debug = debug;
		this.document = document;
		this.addTitle = addTitle;
		augmentDocument(document);
		init();
	}

	/**
	 * Iterates through all the ELEMENT nodes in a document
	 * and gives them ids if they don't already have them.
147 * 148 * @param document 149 */ 150 public static void augmentDocument(Document document) { 151 DocumentTraversal traversal = (DocumentTraversal) document; 152 153 TreeWalker walker = traversal.createTreeWalker(document, NodeFilter.SHOW_ELEMENT, null, true); 154 155 traverseLevel(walker, 0); 156 } 157 158 private static int traverseLevel(TreeWalker walker, int counter) { 159 // describe current node: 160 Node parend = walker.getCurrentNode(); 161 162 if (parend instanceof Element) { 163 if (((Element)parend).getAttribute("id").length() == 0) { 164 ((Element)parend).setAttribute("id", "gen-id-"+counter); 165 counter++; 166 } 167 } 168 169 // traverse children: 170 for (Node n = walker.firstChild(); n != null; 171 n = walker.nextSibling()) { 172 counter = traverseLevel(walker, counter); 173 } 174 175 // return position to the current (level up): 176 walker.setCurrentNode(parend); 177 178 return counter; 179 } 180 181 protected void dbg(String s) { 182 if (debug) 183 System.err.println(s); 184 } 185 186 protected String getTitle() { 187 NodeList l = document.getElementsByTagName("title"); 188 189 if (l.getLength() == 0) return ""; 190 191 return l.item(0).getTextContent(); 192 } 193 194 /** 195 * Javascript-like String.match 196 * @param input 197 * @param regex 198 * @return 199 */ 200 protected String[] match(String input, String regex) { 201 Matcher matcher = Pattern.compile(regex).matcher(input); 202 List<String> matches = new ArrayList<String>(); 203 204 while ( matcher.find() ) { 205 matches.add(matcher.group(0)); 206 } 207 208 return matches.toArray(new String[matches.size()]); 209 } 210 211 /** 212 * @return True if the article has any detected content; false otherwise. 
213 */ 214 public boolean hasContent() { 215 return articleContent != null; 216 } 217 218 /** 219 * Javascript-like String.search 220 * @param input 221 * @param regex 222 * @return 223 */ 224 protected int search(String input, String regex) { 225 Matcher matcher = Pattern.compile(regex).matcher(input); 226 227 if (!matcher.find()) return -1; 228 return matcher.start(); 229 } 230 231 232 protected void findArticleEncoding() { 233 NodeList nl = document.getElementsByTagName("meta"); 234 for (int j=0; j<nl.getLength(); j++) { 235 if (((Element)nl.item(j)).getAttribute("http-equiv").equals("Content-Type")) { 236 article_contentType = ((Element)nl.item(j)).getAttribute("content"); 237 return; 238 } 239 } 240 241 } 242 243 protected void findArticleDate() { 244 //<meta name="OriginalPublicationDate" content="2010/07/12 14:08:02"/> 245 //<meta name="DC.date.issued" content="2010-07-12"> 246 NodeList nl = document.getElementsByTagName("meta"); 247 for (int j=0; j<nl.getLength(); j++) { 248 if (((Element)nl.item(j)).getAttribute("name").equals("OriginalPublicationDate")) { 249 article_date_string = ((Element)nl.item(j)).getAttribute("content"); 250 article_date = DateTime.parse(article_date_string).toDate(); 251 return; 252 } 253 if (((Element)nl.item(j)).getAttribute("name").equals("DC.date.issued")) { 254 article_date_string = ((Element)nl.item(j)).getAttribute("content"); 255 article_date = DateTime.parse(article_date_string).toDate(); 256 return; 257 } 258 } 259 260 //<time datetime="2010-07-12T10:26BST" pubdate>Monday 12 July 2010 10.26 BST</time> 261 nl = document.getElementsByTagName("time"); 262 for (int j=0; j<nl.getLength(); j++) { 263 if (((Element)nl.item(j)).getAttributeNode("pubdate") != null) { 264 article_date_string = ((Element)nl.item(j)).getAttribute("datetime"); 265 article_date = DateTime.parse(article_date_string).toDate(); 266 return; 267 } 268 } 269 270 //<span class="date">14:08 GMT, Monday, 12 July 2010 15:08 UK</span> 271 //<p 
class="date">09.07.2010 @ 17:49 CET</p> 272 //<p class="date">Today @ 09:29 CET</p> 273 nl = document.getElementsByTagName("*"); 274 for (int j=0; j<nl.getLength(); j++) { 275 if ((((Element)nl.item(j)).getAttribute("class").contains("date") || 276 ((Element)nl.item(j)).getAttribute("class").contains("Date") ) && 277 !(((Element)nl.item(j)).getAttribute("class").contains("update") || 278 ((Element)nl.item(j)).getAttribute("class").contains("Update")) 279 ) { 280 article_date_string = getInnerTextSep((Element)nl.item(j)).trim(); 281 parseDate(); 282 return; 283 } 284 } 285 for (int j=0; j<nl.getLength(); j++) { 286 if ((((Element)nl.item(j)).getAttribute("id").contains("date") || 287 ((Element)nl.item(j)).getAttribute("id").contains("Date") ) && 288 !(((Element)nl.item(j)).getAttribute("id").contains("update") || 289 ((Element)nl.item(j)).getAttribute("id").contains("Update")) 290 ) { 291 article_date_string = getInnerTextSep((Element)nl.item(j)).trim(); 292 parseDate(); 293 return; 294 } 295 } 296 297 //Last updated at 3:05 PM on 12th July 2010 298 nl = document.getElementsByTagName("*"); 299 for (int j=0; j<nl.getLength(); j++) { 300 String text = nl.item(j).getTextContent(); 301 302 if (text == null) 303 continue; 304 305 Pattern p = Pattern.compile("Last updated at (\\d+:\\d\\d [AP]M on \\d+[thsndr]+ \\w+ \\d\\d\\d\\d)"); 306 Matcher m = p.matcher(text); 307 if (m.find()) { 308 article_date_string = m.group(1); 309 310 String cpy = article_date_string.replaceAll("th", ""); 311 cpy = cpy.replaceAll("st", ""); 312 cpy = cpy.replaceAll("nd", ""); 313 cpy = cpy.replaceAll("rd", ""); 314 315 SimpleDateFormat sdf = new SimpleDateFormat("h:mm a 'on' dd MMMM yyyy"); 316 try { article_date = sdf.parse(cpy); } catch (ParseException e) {} 317 return; 318 } 319 } 320 } 321 322 @SuppressWarnings("deprecation") 323 protected void parseDate() { 324 if (article_date_string == null || article_date_string.trim().isEmpty() ) return; 325 326 if 
(article_date_string.contains("Today")) { 327 try { 328 SimpleDateFormat sdf = new SimpleDateFormat("'Today @' HH:mm z"); 329 article_date = sdf.parse(article_date_string); 330 Date now = new Date(); 331 article_date.setDate(now.getDate()); 332 article_date.setMonth(now.getMonth()); 333 article_date.setYear(now.getYear()); 334 } catch (ParseException e) {} 335 } else { 336 try { 337 SimpleDateFormat sdf = new SimpleDateFormat("h:mm z',' E',' dd M yyyy"); 338 article_date = sdf.parse(article_date_string); 339 } catch (ParseException e) { 340 try { 341 SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy '@' HH:mm z"); 342 article_date = sdf.parse(article_date_string); 343 } catch (ParseException ee) { 344 try { 345 SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy"); 346 article_date = sdf.parse(article_date_string); 347 } catch (ParseException eee) { 348 try { 349 article_date = DateTime.parse(article_date_string).toDate(); 350 } catch (IllegalArgumentException ie) { 351 } catch (java.lang.ArrayIndexOutOfBoundsException ie) { 352 System.out.println(article_date_string); 353 } 354 } 355 } 356 } 357 } 358 } 359 360 /** 361 * Get the article title. 
362 * 363 * @return void 364 **/ 365 protected String findArticleTitle() { 366 String curTitle = "", origTitle = ""; 367 368 curTitle = origTitle = getTitle(); 369 370 // 371 List<String> potentialTitles = new ArrayList<String>(); 372 for (int i=1; i<=6; i++) { 373 NodeList nl = document.getElementsByTagName("h"+i); 374 if (nl.getLength() > 0) { 375 for (int j=0; j<nl.getLength(); j++) 376 potentialTitles.add(nl.item(j).getTextContent().trim()); 377 } 378 } 379 380 String potentialTitle = null; 381 int score = 0; 382 for (String s : potentialTitles) { 383 if (s.length()>score && curTitle.contains(s)) { 384 potentialTitle = s; 385 score = s.length(); 386 } 387 } 388 if (potentialTitle != null) return potentialTitle; 389 // 390 391 if(match(curTitle, " ["+Regexps.titleSeparatorRe+"]+ ").length > 0) 392 { 393 curTitle = origTitle.replaceAll("(.*) ["+Regexps.titleSeparatorRe+"]+ .*", "$1"); 394 395 if(curTitle.split(" ").length < 3) { 396 curTitle = origTitle.replaceAll("(?i)[^"+Regexps.titleSeparatorRe+"]*["+Regexps.titleSeparatorRe+"]+(.*)", "$1"); 397 } 398 } 399 else if(curTitle.indexOf(": ") != -1) 400 { 401 curTitle = origTitle.replaceAll("(?i).*:(.*)", "$1"); 402 403 if(curTitle.split(" ").length < 3) { 404 curTitle = origTitle.replaceAll("(?i)[^:]*[:](.*)", "$1"); 405 } 406 } 407 else if(curTitle.length() > 150 || curTitle.length() < 15) 408 { 409 NodeList hOnes = document.getElementsByTagName("h1"); 410 if(hOnes.getLength() == 1) 411 { 412 curTitle = getInnerText((Element) hOnes.item(0)); 413 } 414 } 415 416 curTitle = curTitle.replaceAll( Regexps.trimRe, "" ); 417 418 if(curTitle.split(" ").length <= 3) { 419 curTitle = origTitle; 420 } 421 422 return curTitle; 423 } 424 425 /** 426 * Equivalent to document.body in JS 427 * @return 428 */ 429 protected Element getBody() { 430 NodeList nl = document.getElementsByTagName("body"); 431 432 if (nl.getLength() == 0) 433 return null; 434 else 435 return (Element) nl.item(0); 436 } 437 438 /** 439 * Runs readability. 
440 * 441 * Workflow: 442 * 1. Prep the document by removing script tags, css, etc. 443 * 2. Build readability"s DOM tree. 444 * 3. Grab the article content from the current dom tree. 445 * 4. Replace the current DOM tree with the new one. 446 * 5. Read peacefully. 447 * 448 **/ 449 protected void init() { 450 if(getBody() != null && bodyCache == null) { 451 bodyCache = getBody().cloneNode(true); } 452 453 findArticleDate(); //must be done before prepDocument() 454 455 findArticleEncoding(); 456 457 prepDocument(); 458 459 /* Build readability"s DOM tree */ 460 articleTitle = findArticleTitle(); 461 articleContent = grabArticle(); 462 463 /** 464 * If we attempted to strip unlikely candidates on the first run through, and we ended up with no content, 465 * that may mean we stripped out the actual content so we couldn"t parse it. So re-run init while preserving 466 * unlikely candidates to have a better shot at getting our content out properly. 467 **/ 468 if(getInnerText(articleContent, false).length() < 250) 469 { 470 if (flags.contains(Flag.FLAG_STRIP_UNLIKELYS)) { 471 flags.remove(Flag.FLAG_STRIP_UNLIKELYS); 472 getBody().getParentNode().replaceChild(bodyCache, getBody()); 473 init(); 474 return; 475 } 476 else if (flags.contains(Flag.FLAG_WEIGHT_CLASSES)) { 477 flags.remove(Flag.FLAG_WEIGHT_CLASSES); 478 getBody().getParentNode().replaceChild(bodyCache, getBody()); 479 init(); 480 return; 481 } 482 else { 483 articleContent = null; 484 } 485 } 486 487 if (addTitle && articleContent != null) { 488 Element titleNode = document.createElement("h1"); 489 titleNode.setAttribute("id", "title"); 490 titleNode.appendChild(document.createTextNode(getArticleTitle())); 491 articleContent.insertBefore(titleNode, articleContent.getFirstChild()); 492 } 493 } 494 495 /** 496 * Prepare the HTML document for readability to scrape it. 497 * This includes things like stripping javascript, CSS, and handling terrible markup. 
498 * 499 **/ 500 protected void prepDocument() { 501 /** 502 * In some cases a body element can"t be found (if the HTML is totally hosed for example) 503 * so we create a new body node and append it to the document. 504 */ 505 if(getBody() == null) 506 { 507 Node body = document.createElement("body"); 508 document.appendChild(body); 509 } 510 511 //frames are not supported in this version! 512 // NodeList frames = document.getElementsByTagName("frame"); 513 // if(frames.length > 0) 514 // { 515 // Node bestFrame = null; 516 // int bestFrameSize = 0; 517 // for(int frameIndex = 0; frameIndex < frames.getLength(); frameIndex++) 518 // { 519 // int frameSize = frames.item(frameIndex).offsetWidth + frames[frameIndex].offsetHeight; 520 // var canAccessFrame = false; 521 // try { 522 // frames[frameIndex].contentWindow.document.body; 523 // canAccessFrame = true; 524 // } 525 // catch(eFrames) { 526 // dbg(eFrames); 527 // } 528 // 529 // if(canAccessFrame && frameSize > bestFrameSize) 530 // { 531 // bestFrame = frames[frameIndex]; 532 // bestFrameSize = frameSize; 533 // } 534 // } 535 // 536 // if(bestFrame) 537 // { 538 // var newBody = document.createElement("body"); 539 // newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML; 540 // newBody.style.overflow = "scroll"; 541 // document.body = newBody; 542 // 543 // var frameset = document.getElementsByTagName("frameset")[0]; 544 // if(frameset) { 545 // frameset.parentNode.removeChild(frameset); } 546 // 547 // readability.frameHack = true; 548 // } 549 // } 550 551 /* remove all scripts that are not readability */ 552 NodeList scripts = document.getElementsByTagName("script"); 553 for(int i = scripts.getLength()-1; i >= 0; i--) 554 { 555 scripts.item(i).getParentNode().removeChild(scripts.item(i)); 556 } 557 558 /* Remove all style tags in head */ 559 NodeList styleTags = document.getElementsByTagName("style"); 560 for (int st=0;st < styleTags.getLength(); st++) { 561 
styleTags.item(st).getParentNode().removeChild(styleTags.item(st)); 562 } 563 564 /* Remove all meta tags */ 565 NodeList metaTags = document.getElementsByTagName("meta"); 566 for (int mt=0;mt < metaTags.getLength(); mt++) { 567 metaTags.item(mt).getParentNode().removeChild(metaTags.item(mt)); 568 } 569 570 /* Turn all double br's into p's */ 571 /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ 572 //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrsRe, '</p><p>').replace(readability.regexps.replaceFontsRe, '<$1span>'); 573 Element body = getBody(); 574 // Node rep = stringToNode(nodeToString(body).replaceAll(Regexps.replaceBrsRe, "</P><P>").replaceAll(Regexps.replaceFontsRe, "<$1span>")); 575 // body.getParentNode().replaceChild(rep, body); 576 577 //This is slow! 578 Node frag = stringToNode(getInnerHTML(body).replaceAll(Regexps.replaceBrsRe, "</P><P>").replaceAll(Regexps.replaceFontsRe, "<$1span>")); 579 removeChildren(body); 580 body.appendChild(frag); 581 582 /* Remove all comments */ 583 removeComments(document); 584 } 585 586 protected void removeComments(Node n) { 587 if (n.getNodeType() == Node.COMMENT_NODE) { 588 n.getParentNode().removeChild(n); 589 } else { 590 NodeList nl = n.getChildNodes(); 591 for (int i=0; i<nl.getLength(); i++) 592 removeComments(nl.item(i)); 593 } 594 } 595 596 /** 597 * Prepare the article node for display. Clean out any inline styles, 598 * iframes, forms, strip extraneous <p> tags, etc. 599 * 600 * @param Element 601 **/ 602 protected void prepArticle(Element articleContent) { 603 cleanStyles(articleContent); 604 killBreaks(articleContent); 605 606 /* Clean out junk from the article content */ 607 clean(articleContent, "form"); 608 clean(articleContent, "object"); 609 clean(articleContent, "h1"); 610 /** 611 * If there is only one h2, they are probably using it 612 * as a header and not a subheader, so remove it since we already have a header. 
613 ***/ 614 if(articleContent.getElementsByTagName("h2").getLength() == 1) { 615 clean(articleContent, "h2"); 616 } 617 clean(articleContent, "iframe"); 618 619 cleanHeaders(articleContent); 620 621 /* Do these last as the previous stuff may have removed junk that will affect these */ 622 cleanConditionally(articleContent, "table"); 623 cleanConditionally(articleContent, "ul"); 624 cleanConditionally(articleContent, "div"); 625 626 /* Remove extra paragraphs */ 627 NodeList articleParagraphs = articleContent.getElementsByTagName("p"); 628 for(int i = articleParagraphs.getLength()-1; i >= 0; i--) 629 { 630 int imgCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("img").getLength(); 631 int embedCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("embed").getLength(); 632 int objectCount = ((Element) articleParagraphs.item(i)).getElementsByTagName("object").getLength(); 633 634 if(imgCount == 0 && embedCount == 0 && objectCount == 0 && getInnerText((Element) articleParagraphs.item(i), false) == "") 635 { 636 articleParagraphs.item(i).getParentNode().removeChild(articleParagraphs.item(i)); 637 } 638 } 639 640 //articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, "<p"); 641 Node n = stringToNode(getInnerHTML(articleContent).replaceAll("(?i)<br[^>]*>\\s*<p", "<P")); 642 removeChildren(articleContent); 643 articleContent.appendChild(n); 644 645 //now remove empty p's and tidy up 646 NodeList nl = articleContent.getElementsByTagName("p"); 647 for (int i=nl.getLength()-1; i>=0; i--) { 648 if (nl.item(i).getTextContent().trim().length() == 0) 649 { 650 nl.item(i).getParentNode().removeChild(nl.item(i)); 651 } else if (nl.item(i).getChildNodes().getLength() == 1 && nl.item(i).getChildNodes().item(0).getNodeType() == Node.TEXT_NODE) { 652 nl.item(i).setTextContent("\n" + nl.item(i).getTextContent().trim() + "\n"); 653 } 654 else if (((Element) nl.item(i)).getAttribute("class").equals("readability-styled")) 655 { 656 
nl.item(i).getParentNode().replaceChild(document.createTextNode(nl.item(i).getTextContent()), nl.item(i)); 657 } 658 } 659 660 } 661 662 protected void removeChildren(Node n) { 663 NodeList nl = n.getChildNodes(); 664 int nn = nl.getLength(); 665 for (int i=0; i<nn; i++) 666 n.removeChild(nl.item(0)); 667 } 668 669 /** 670 * Initialize a node with the readability object. Also checks the 671 * className/id for special names to add to its score. 672 * 673 * @param Element 674 **/ 675 protected void initializeNode(Element node) { 676 float contentScore = 0; 677 678 if (node.getTagName() == "DIV") { 679 contentScore += 5; 680 } else if (node.getTagName() == "PRE" || node.getTagName() == "TD" || node.getTagName() == "BLOCKQUOTE") { 681 contentScore += 3; 682 } else if (node.getTagName() == "ADDRESS" || node.getTagName() == "OL" || node.getTagName() == "UL" 683 || node.getTagName() == "DL" || node.getTagName() == "DD" || node.getTagName() == "DT" 684 || node.getTagName() == "LI" || node.getTagName() == "FORM") { 685 contentScore -= 3; 686 } else if (node.getTagName() == "H1" || node.getTagName() == "H2" || node.getTagName() == "H3" 687 || node.getTagName() == "H4" || node.getTagName() == "H5" || node.getTagName() == "H6" 688 || node.getTagName() == "TH") { 689 contentScore -= 5; 690 } 691 692 contentScore += getClassWeight(node); 693 node.setUserData("readability", contentScore, null); 694 } 695 696 /** 697 * Get an elements class/id weight. Uses regular expressions to tell if this 698 * element looks good or bad. 
699 * 700 * @param Element 701 * @return number (Integer) 702 **/ 703 protected int getClassWeight(Element e) { 704 if (!flags.contains(Flag.FLAG_WEIGHT_CLASSES)) { 705 return 0; 706 } 707 708 int weight = 0; 709 710 /* Look for a special classname */ 711 if (e.getAttribute("class") != "") 712 { 713 if(search(e.getAttribute("class"), Regexps.negativeRe) != -1) { 714 weight -= 25; 715 } 716 717 if(search(e.getAttribute("class"), Regexps.positiveRe) != -1) { 718 weight += 25; 719 } 720 } 721 722 /* Look for a special ID */ 723 if (e.getAttribute("id") != "") 724 { 725 if(search(e.getAttribute("id"), Regexps.negativeRe) != -1) { 726 weight -= 25; 727 } 728 729 if(search(e.getAttribute("id"), Regexps.positiveRe) != -1) { 730 weight += 25; 731 } 732 } 733 734 return weight; 735 } 736 737 protected void cleanStyles() { 738 cleanStyles((Element) document); 739 } 740 741 /** 742 * Remove the style attribute on every e and under. 743 * TODO: Test if getElementsByTagName(*) is faster. 744 * 745 * @param Element 746 **/ 747 protected void cleanStyles(Element e) { 748 if(e == null) return; 749 Node cur = e.getFirstChild(); 750 751 // Remove any root styles, if we"re able. 752 if (!e.getAttribute("class").equals("readability-styled")) 753 e.removeAttribute("style"); 754 755 // Go until there are no more child nodes 756 while ( cur != null ) { 757 if ( cur.getNodeType() == Element.ELEMENT_NODE) { 758 // Remove style attribute(s) : 759 if(!((Element) cur).getAttribute("class").equals("readability-styled")) { 760 ((Element) cur).removeAttribute("style"); 761 } 762 cleanStyles( (Element) cur ); 763 } 764 cur = cur.getNextSibling(); 765 } 766 } 767 768 /** 769 * Remove extraneous break tags from a node. 
770 * 771 * @param Element 772 **/ 773 protected void killBreaks(Element e) { 774 //e.innerHTML = e.innerHTML.replace(readability.regexps.killBreaksRe,"<br />"); 775 776 Node n = stringToNode(getInnerHTML(e).replaceAll(Regexps.killBreaksRe,"<BR />")); 777 removeChildren(e); 778 e.appendChild(n); 779 } 780 781 /** 782 * Clean a node of all elements of type "tag". 783 * (Unless it"s a youtube/vimeo video. People love movies.) 784 * 785 * @param Element 786 * @param string tag to clean 787 **/ 788 protected void clean(Element e, String tag) { 789 NodeList targetList = e.getElementsByTagName( tag ); 790 boolean isEmbed = (tag.equals("object") || tag.equals("embed")); 791 792 for (int y=targetList.getLength()-1; y >= 0; y--) { 793 /* Allow youtube and vimeo videos through as people usually want to see those. */ 794 if(isEmbed) { 795 String attributeValues = ""; 796 for (int i=0, il=targetList.item(y).getAttributes().getLength(); i < il; i++) { 797 attributeValues += targetList.item(y).getAttributes().item(i).getNodeValue() + "|"; 798 } 799 800 /* First, check the elements attributes to see if any of them contain youtube or vimeo */ 801 if (search(attributeValues, Regexps.videoRe) != -1) { 802 continue; 803 } 804 805 /* Then check the elements inside this element for the same. */ 806 if (search(getInnerHTML(targetList.item(y)), Regexps.videoRe) != -1) { 807 continue; 808 } 809 } 810 811 targetList.item(y).getParentNode().removeChild(targetList.item(y)); 812 } 813 } 814 815 /** 816 * Clean out spurious headers from an Element. Checks things like classnames and link density. 
817 * 818 * @param Element 819 **/ 820 protected void cleanHeaders(Element e) { 821 for (int headerIndex = 1; headerIndex < 7; headerIndex++) { 822 NodeList headers = e.getElementsByTagName("h" + headerIndex); 823 for (int i=headers.getLength()-1; i >=0; i--) { 824 if (getClassWeight((Element) headers.item(i)) < 0 || getLinkDensity((Element) headers.item(i)) > LINK_DENSITY_THRESHOLD) { 825 headers.item(i).getParentNode().removeChild(headers.item(i)); 826 } 827 } 828 } 829 } 830 831 /** 832 * Get the density of links as a percentage of the content 833 * This is the amount of text that is inside a link divided by the total text in the node. 834 * 835 * @param Element 836 * @return number (float) 837 **/ 838 protected float getLinkDensity(Element e) { 839 NodeList links = e.getElementsByTagName("a"); 840 int textLength = getInnerText(e).length(); 841 int linkLength = 0; 842 843 for(int i=0, il=links.getLength(); i<il;i++) 844 { 845 linkLength += getInnerText((Element) links.item(i)).length(); 846 } 847 848 if (linkLength == 0) return 0; 849 850 return (float)linkLength / (float)textLength; 851 } 852 853 /** 854 * Clean an element of all tags of type "tag" if they look fishy. 855 * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. 856 **/ 857 protected void cleanConditionally(Element e, String tag) { 858 NodeList tagsList = e.getElementsByTagName(tag); 859 int curTagsLength = tagsList.getLength(); 860 861 /** 862 * Gather counts for other typical elements embedded within. 863 * Traverse backwards so we can remove nodes at the same time without effecting the traversal. 864 * 865 * Todo: Consider taking into account original contentScore here. 866 **/ 867 for (int i=curTagsLength-1; i >= 0; i--) { 868 int weight = getClassWeight((Element) tagsList.item(i)); 869 float contentScore = (tagsList.item(i).getUserData("readability") != null) ? 
(Float)(tagsList.item(i).getUserData("readability")) : 0; 870 871 dbg("Cleaning Conditionally " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class")+ ":" + ((Element)tagsList.item(i)).getAttribute("id") + ")" + ((tagsList.item(i).getUserData("readability") != null) ? (" with score " + tagsList.item(i).getUserData("readability")) : "")); 872 873 if(weight+contentScore < 0) 874 { 875 dbg("Removing " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class")+ ":" + ((Element)tagsList.item(i)).getAttribute("id") + ")"); 876 tagsList.item(i).getParentNode().removeChild(tagsList.item(i)); 877 } 878 else if ( getCharCount((Element) tagsList.item(i), ",") < 10) { 879 /** 880 * If there are not very many commas, and the number of 881 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. 882 **/ 883 int p = ((Element) tagsList.item(i)).getElementsByTagName("p").getLength(); 884 int img = ((Element) tagsList.item(i)).getElementsByTagName("img").getLength(); 885 int li = ((Element) tagsList.item(i)).getElementsByTagName("li").getLength()-100; 886 int input = ((Element) tagsList.item(i)).getElementsByTagName("input").getLength(); 887 888 int embedCount = 0; 889 NodeList embeds = ((Element) tagsList.item(i)).getElementsByTagName("embed"); 890 for(int ei=0,il=embeds.getLength(); ei < il; ei++) { 891 if (search(((Element)embeds.item(ei)).getAttribute("src"), Regexps.videoRe) == -1) { 892 embedCount++; 893 } 894 } 895 896 float linkDensity = getLinkDensity((Element) tagsList.item(i)); 897 int contentLength = getInnerText((Element) tagsList.item(i)).length(); 898 boolean toRemove = false; 899 900 if ( img > p ) { 901 toRemove = true; 902 } else if(li > p && tag != "ul" && tag != "ol") { 903 toRemove = true; 904 } else if( input > Math.floor(p/3) ) { 905 toRemove = true; 906 } else if(contentLength < 25 && (img == 0 || img > 2) ) { 907 toRemove = true; 908 } else if(weight < 25 && linkDensity > 0.2) 
{ 909 toRemove = true; 910 } else if(weight >= 25 && linkDensity > 0.5) { 911 toRemove = true; 912 } else if((embedCount == 1 && contentLength < 75) || embedCount > 1) { 913 toRemove = true; 914 } 915 916 if ( img == 1 && p == 0 && contentLength == 0 ) { 917 Element theImg = (Element) ((Element) tagsList.item(i)).getElementsByTagName("img").item(0); 918 919 String w = ""; 920 if (theImg.getAttribute("width") != null) w = theImg.getAttribute("width"); 921 922 String h = ""; 923 if (theImg.getAttribute("height") != null) h = theImg.getAttribute("height"); 924 925 if (!(w.equals("0") || h.equals("0"))) 926 toRemove = false; //special case - it's just an inline image 927 } 928 929 if(toRemove) { 930 dbg("Removing " + tagsList.item(i) + " (" + ((Element) tagsList.item(i)).getAttribute("class")+ ":" + ((Element)tagsList.item(i)).getAttribute("id") + ")"); 931 tagsList.item(i).getParentNode().removeChild(tagsList.item(i)); 932 } 933 } 934 } 935 } 936 937 /** 938 * Get the number of times a string s appears in the node e. 939 * 940 * @param Element 941 * @param string - what to split on. Default is "," 942 * @return number (integer) 943 **/ 944 protected int getCharCount(Element e, String s) { 945 return getInnerText(e).split(s).length-1; 946 } 947 948 protected int getCharCount(Element e) { 949 return getCharCount(e, ","); 950 } 951 952 /** 953 * @return The article title 954 */ 955 public String getArticleTitle() { 956 return articleTitle; 957 } 958 959 /** 960 * @return The content type of the article 961 */ 962 public String getArticleContentType() { 963 return article_contentType; 964 } 965 966 /*** 967 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is 968 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 
969 * 970 * @return Element 971 **/ 972 protected Element grabArticle() { 973 boolean stripUnlikelyCandidates = flags.contains(Flag.FLAG_STRIP_UNLIKELYS); 974 975 /** 976 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs 977 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) 978 * 979 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 980 * Todo: Shouldn't this be a reverse traversal? 981 **/ 982 Element node = null; 983 List<Element> nodesToScore = new ArrayList<Element>(); 984 for(int nodeIndex = 0; (node = (Element)document.getElementsByTagName("*").item(nodeIndex)) != null; nodeIndex++) 985 { 986 /* Remove unlikely candidates */ 987 if (stripUnlikelyCandidates) { 988 String unlikelyMatchString = node.getAttribute("class") + node.getAttribute("id"); 989 if (search(unlikelyMatchString, Regexps.unlikelyCandidatesRe) != -1 && 990 search(unlikelyMatchString, Regexps.okMaybeItsACandidateRe) == -1 && 991 !node.getTagName().equals("BODY")) 992 { 993 dbg("Removing unlikely candidate - " + unlikelyMatchString); 994 node.getParentNode().removeChild(node); 995 nodeIndex--; 996 continue; 997 } 998 } 999 1000 if (node.getTagName().equals("P") || node.getTagName().equals("TD")) { 1001 nodesToScore.add(node); 1002 } 1003 1004 /* Turn all divs that don't have children block level elements into p's */ 1005 if (node.getTagName().equals("DIV")) { 1006 1007 if (search(getInnerHTML(node), Regexps.divToPElementsRe) == -1) { 1008 dbg("Altering div to p"); 1009 Element newNode = document.createElement("P"); 1010 1011 //newNode.innerHTML = node.innerHTML; 1012 NodeList nl = node.getChildNodes(); 1013 for (int i=0; i<nl.getLength(); i++) newNode.appendChild(nl.item(i)); 1014 1015 node.getParentNode().replaceChild(newNode, node); 1016 nodeIndex--; 1017 } 1018 else 1019 { 1020 /* EXPERIMENTAL */ 1021 
for(int i = 0, il = node.getChildNodes().getLength(); i < il; i++) { 1022 Node childNode = node.getChildNodes().item(i); 1023 if(childNode.getNodeType() == Element.TEXT_NODE) { 1024 dbg("replacing text node with a p tag with the same content."); 1025 Element p = document.createElement("p"); 1026 //p.innerHTML = childNode.nodeValue; 1027 p.setNodeValue(childNode.getNodeValue()); 1028 p.setTextContent(childNode.getTextContent()); 1029 //p.style.display = "inline"; 1030 p.setAttribute("class", "readability-styled"); 1031 childNode.getParentNode().replaceChild(p, childNode); 1032 } 1033 } 1034 } 1035 } 1036 } 1037 1038 /** 1039 * Loop through all paragraphs, and assign a score to them based on how content-y they look. 1040 * Then add their score to their parent node. 1041 * 1042 * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. 1043 **/ 1044 List<Element> candidates = new ArrayList<Element>(); 1045 for (int pt=0; pt < nodesToScore.size(); pt++) { 1046 Element parentNode = (Element) nodesToScore.get(pt).getParentNode(); 1047 Element grandParentNode = (Element) parentNode.getParentNode(); 1048 String innerText = getInnerText(nodesToScore.get(pt)); 1049 1050 /* If this paragraph is less than 25 characters, don't even count it. */ 1051 if(innerText.length() < 25) { 1052 continue; 1053 } 1054 1055 /* Initialize readability data for the parent. */ 1056 if(parentNode.getUserData("readability") == null) 1057 { 1058 initializeNode(parentNode); 1059 candidates.add(parentNode); 1060 } 1061 1062 /* Initialize readability data for the grandparent. */ 1063 if(grandParentNode.getUserData("readability") == null) 1064 { 1065 initializeNode(grandParentNode); 1066 candidates.add(grandParentNode); 1067 } 1068 1069 float contentScore = 0; 1070 1071 /* Add a point for the paragraph itself as a base. 
*/ 1072 contentScore++; 1073 1074 /* Add points for any commas within this paragraph */ 1075 contentScore += innerText.split(",").length; 1076 1077 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ 1078 contentScore += Math.min(Math.floor((float)innerText.length() / 100F), 3F); 1079 1080 /* Add the score to the parent. The grandparent gets half. */ 1081 parentNode.setUserData("readability", ((Float)(parentNode.getUserData("readability")) + contentScore), null); 1082 grandParentNode.setUserData("readability", ((Float)(grandParentNode.getUserData("readability"))) + (contentScore/2F), null); 1083 } 1084 1085 /** 1086 * After we've calculated scores, loop through all of the possible candidate nodes we found 1087 * and find the one with the highest score. 1088 **/ 1089 Element topCandidate = null; 1090 for(int c=0, cl=candidates.size(); c < cl; c++) 1091 { 1092 /** 1093 * Scale the final candidates score based on link density. Good content should have a 1094 * relatively small link density (5% or less) and be mostly unaffected by this operation. 1095 **/ 1096 1097 candidates.get(c).setUserData("readability", (Float)(candidates.get(c).getUserData("readability")) * (1F-getLinkDensity(candidates.get(c))), null); 1098 1099 dbg("Candidate: " + candidates.get(c) + " (" + candidates.get(c).getAttribute("class")+ ":" + candidates.get(c).getAttribute("id")+ ") with score " + candidates.get(c).getUserData("readability")); 1100 1101 if(topCandidate == null || (Float)(candidates.get(c).getUserData("readability")) > ((Float)topCandidate.getUserData("readability"))) { 1102 topCandidate = candidates.get(c); 1103 } 1104 } 1105 1106 if (topCandidate != null) 1107 dbg("==> TOP Candidate: " + topCandidate + " (" + topCandidate.getAttribute("class")+ ":" + topCandidate.getAttribute("id")+ ") with score " + topCandidate.getUserData("readability")); 1108 1109 /** 1110 * If we still have no top candidate, just use the body as a last resort. 
1111 * We also have to copy the body node so it is something we can modify. 1112 **/ 1113 if (topCandidate == null || topCandidate.getTagName().equals("BODY")) 1114 { 1115 topCandidate = document.createElement("DIV"); 1116 1117 //topCandidate.innerHTML = document.body.innerHTML; 1118 NodeList nl = getBody().getChildNodes(); 1119 for (int i=0; i<nl.getLength(); i++) topCandidate.appendChild(nl.item(i)); 1120 //document.body.innerHTML = ""; //should be covered by above 1121 1122 1123 getBody().appendChild(topCandidate); 1124 initializeNode(topCandidate); 1125 } 1126 1127 /** 1128 * Now that we have the top candidate, look through its siblings for content that might also be related. 1129 * Things like preambles, content split by ads that we removed, etc. 1130 **/ 1131 Element articleContent = document.createElement("DIV"); 1132 articleContent.setAttribute("id", "readability-content"); 1133 float siblingScoreThreshold = (float) Math.max(10F, (Float)topCandidate.getUserData("readability") * 0.2F); 1134 NodeList siblingNodes = topCandidate.getParentNode().getChildNodes(); 1135 1136 for(int s=0, sl=siblingNodes.getLength(); s < sl; s++) 1137 { 1138 Node siblingNode = siblingNodes.item(s); 1139 boolean append = false; 1140 1141 if (siblingNode instanceof Element) 1142 dbg("Looking at sibling node: " + siblingNode + " (" + ((Element) siblingNode).getAttribute("class") + ":" + ((Element) siblingNode).getAttribute("id") + ")" + ((siblingNode.getUserData("readability") != null) ? (" with score " + siblingNode.getUserData("readability")) : "")); 1143 dbg("Sibling has score " + (siblingNode.getUserData("readability") != null ? 
siblingNode.getUserData("readability") : "Unknown")); 1144 1145 if(siblingNode == topCandidate) 1146 { 1147 append = true; 1148 } 1149 1150 float contentBonus = 0; 1151 /* Give a bonus if sibling nodes and top candidates have the example same classname */ 1152 if(siblingNode instanceof Element && ((Element) siblingNode).getAttribute("class").equals(topCandidate.getAttribute("class")) && !topCandidate.getAttribute("class").equals("")) { 1153 contentBonus += (Float)topCandidate.getUserData("readability") * 0.2F; 1154 } 1155 1156 if(siblingNode.getUserData("readability") != null && ((Float)siblingNode.getUserData("readability")+contentBonus) >= siblingScoreThreshold) 1157 { 1158 append = true; 1159 } 1160 1161 if(siblingNode.getNodeName().equals("P")) { 1162 float linkDensity = getLinkDensity((Element) siblingNode); 1163 String nodeContent = getInnerText((Element) siblingNode); 1164 int nodeLength = nodeContent.length(); 1165 1166 if(nodeLength > 80 && linkDensity < 0.25) 1167 { 1168 append = true; 1169 } 1170 else if(nodeLength < 80 && linkDensity == 0 && search(nodeContent, "\\.( |$)") != -1) 1171 { 1172 append = true; 1173 } 1174 } 1175 1176 if(append) 1177 { 1178 dbg("Appending node: " + siblingNode); 1179 1180 Node nodeToAppend = null; 1181 if(!siblingNode.getNodeName().equals("DIV") && !siblingNode.getNodeName().equals("P")) { 1182 /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. 
*/ 1183 1184 dbg("Altering siblingNode of " + siblingNode.getNodeName() + " to div."); 1185 nodeToAppend = document.createElement("div"); 1186 if (siblingNode instanceof Element) 1187 ((Element) nodeToAppend).setAttribute("id", ((Element) siblingNode).getAttribute("id")); 1188 1189 //nodeToAppend.innerHTML = siblingNode.innerHTML; 1190 NodeList nl = siblingNode.getChildNodes(); 1191 for (int i=0; i<nl.getLength(); i++) nodeToAppend.appendChild(nl.item(i)); 1192 } else { 1193 nodeToAppend = siblingNode; 1194 s--; 1195 sl--; 1196 } 1197 1198 /* To ensure a node does not interfere with readability styles, remove its classnames */ 1199 if (nodeToAppend instanceof Element) 1200 ((Element) nodeToAppend).setAttribute("class", ""); 1201 1202 /* Append sibling and subtract from our list because it removes the node when you append to another node */ 1203 articleContent.appendChild(nodeToAppend); 1204 } 1205 } 1206 1207 /** 1208 * So we have all of the content that we need. Now we clean it up for presentation. 
1209 **/ 1210 prepArticle(articleContent); 1211 1212 return articleContent; 1213 } 1214 1215 protected String getInnerHTML(Node n) { 1216 if (n.getNodeType() == Node.TEXT_NODE) return n.getTextContent(); 1217 1218 String result = ""; 1219 NodeList nl = n.getChildNodes(); 1220 for (int i=0; i<nl.getLength(); i++) { 1221 if (nl.item(i).getNodeType() == Node.TEXT_NODE) 1222 result += nl.item(i).getTextContent(); 1223 else if (nl.item(i).getNodeType() == Node.COMMENT_NODE) 1224 result += "<!-- " + nl.item(i).getTextContent() + " -->"; 1225 else 1226 result += nodeToString(nl.item(i)); 1227 } 1228 1229 return result; 1230 } 1231 1232 protected String nodeToString(Node n) { 1233 return nodeToString(n, false); 1234 } 1235 1236 protected String nodeToString(Node n, boolean pretty) { 1237 try { 1238 DOMImplementationRegistry registry = DOMImplementationRegistry.newInstance(); 1239 DOMImplementationLS impl = (DOMImplementationLS)registry.getDOMImplementation("LS"); 1240 LSSerializer writer = impl.createLSSerializer(); 1241 1242 writer.getDomConfig().setParameter("xml-declaration", false); 1243 if (pretty) { 1244 writer.getDomConfig().setParameter("format-pretty-print", true); 1245 } 1246 1247 return writer.writeToString(n); 1248 } catch (Exception e) { 1249 throw new RuntimeException(e); 1250 } 1251 } 1252 1253 protected Node stringToNode(String str) { 1254 try { 1255 DOMFragmentParser parser = new DOMFragmentParser(); 1256 DocumentFragment fragment = document.createDocumentFragment(); 1257 parser.parse(new InputSource(new StringReader(str)), fragment); 1258 return fragment; 1259 1260 //try and return the element itself if possible... 
1261 // NodeList nl = fragment.getChildNodes(); 1262 // for (int i=0; i<nl.getLength(); i++) if (nl.item(i).getNodeType() == Node.ELEMENT_NODE) return nl.item(i); 1263 // return fragment; 1264 1265 } catch (Exception e) { 1266 throw new RuntimeException(e); 1267 } 1268 } 1269 1270 /** 1271 * Get the inner text of a node - cross browser compatibly. 1272 * This also strips out any excess whitespace to be found. 1273 * 1274 * @param Element 1275 * @return string 1276 **/ 1277 protected String getInnerText(Element e, boolean normalizeSpaces) { 1278 String textContent = ""; 1279 1280 textContent = e.getTextContent().replaceAll( Regexps.trimRe, "" ); 1281 1282 if(normalizeSpaces) { 1283 return textContent.replaceAll( Regexps.normalizeRe, " "); 1284 } else { 1285 return textContent; 1286 } 1287 } 1288 1289 protected String getInnerTextSep(Node e) { 1290 if (e.hasChildNodes()) { 1291 String s = ""; 1292 NodeList nl = e.getChildNodes(); 1293 for (int i=0; i<nl.getLength(); i++) { 1294 if (!nl.item(i).getNodeName().equalsIgnoreCase("script")) 1295 s += getInnerTextSep(nl.item(i)); 1296 } 1297 return s; 1298 } else { 1299 return e.getTextContent() + " "; 1300 } 1301 } 1302 1303 protected String getInnerText(Element e) { 1304 return getInnerText(e, true); 1305 } 1306 1307 /** 1308 * @return The article HTML content as a {@link String}. 1309 */ 1310 public String getArticleHTML() { 1311 if (articleContent == null) return ""; 1312 return nodeToString(articleContent, true); 1313 } 1314 1315 /** 1316 * @return The articles HTML dom node. 1317 */ 1318 public Node getArticleHTML_DOM() { 1319 return articleContent; 1320 } 1321 1322 protected String getArticleDateString() { 1323 return article_date_string; 1324 } 1325 1326 /** 1327 * @return The article date. 1328 */ 1329 public Date getArticleDate() { 1330 return article_date; 1331 } 1332 1333 /** 1334 * @return The text of the article. 
1335 */ 1336 public String getArticleText() { 1337 if (articleContent == null) return "Unable to find article content"; 1338 //return getInnerText(articleContent, false); 1339 return articleContent.getTextContent().trim().replaceAll("[\r|\n|\r\n]{2,}", "\n\n").replaceAll(" {2,}", " "); 1340 } 1341 1342 /** 1343 * @return Any links in the article. 1344 */ 1345 public List<Anchor> getArticleLinks() { 1346 List<Anchor> anchors = new ArrayList<Anchor>(); 1347 if (articleContent == null) return anchors; 1348 1349 NodeList nl = articleContent.getElementsByTagName("a"); 1350 for (int i=0; i<nl.getLength(); i++) { 1351 Element a = (Element) nl.item(i); 1352 1353 Anchor anchor = new Anchor(getInnerText(a), a.getAttribute("href")); 1354 anchors.add(anchor); 1355 } 1356 return anchors; 1357 } 1358 1359 /** 1360 * @return Any links in the document. 1361 */ 1362 public List<Anchor> getAllLinks() { 1363 List<Anchor> anchors = new ArrayList<Anchor>(); 1364 1365 NodeList nl = document.getElementsByTagName("a"); 1366 for (int i=0; i<nl.getLength(); i++) { 1367 Element a = (Element) nl.item(i); 1368 Anchor anchor = new Anchor(getInnerText(a), a.getAttribute("href")); 1369 anchors.add(anchor); 1370 } 1371 return anchors; 1372 } 1373 1374 /** 1375 * @return Any images in the article. 1376 */ 1377 public List<String> getArticleImages() { 1378 List<String> images = new ArrayList<String>(); 1379 if (articleContent == null) return images; 1380 1381 NodeList nl = articleContent.getElementsByTagName("img"); 1382 for (int i=0; i<nl.getLength(); i++) { 1383 Element img = (Element) nl.item(i); 1384 images.add(img.getAttribute("src")); 1385 } 1386 return images; 1387 } 1388 1389 /** 1390 * @return Any subheadings in the article. 
1391 */ 1392 public List<String> getArticleSubheadings() { 1393 List<String> subtitles = new ArrayList<String>(); 1394 if (articleContent == null) return subtitles; 1395 1396 for (int j=1; j<=6; j++) { 1397 NodeList nl = articleContent.getElementsByTagName("h"+j); 1398 if (nl.getLength() > 0) { 1399 for (int i=0; i<nl.getLength(); i++) { 1400 subtitles.add(nl.item(i).getTextContent()); 1401 } 1402 break; 1403 } 1404 } 1405 1406 if (subtitles.size() == 0) { 1407 //try looking for other likely-looking elements 1408 1409 NodeList nl = articleContent.getElementsByTagName("*"); 1410 for (int i=0; i<nl.getLength(); i++) { 1411 if (nl.item(i) instanceof Element && 1412 ((Element) nl.item(i)).getAttribute("class") != null && 1413 search(((Element) nl.item(i)).getAttribute("class"), Regexps.likelySubheadCandidateRe) != -1) 1414 subtitles.add(nl.item(i).getTextContent()); 1415 } 1416 } 1417 1418 return subtitles; 1419 } 1420 1421 protected List<Node> findChildNodesWithName(Node parent, String name) { 1422 NodeList children = parent.getChildNodes(); 1423 List<Node> results = new ArrayList<Node>(); 1424 1425 for (int i = 0; i < children.getLength(); ++i) { 1426 Node child = children.item(i); 1427 if (child == null) 1428 continue; 1429 1430 String nodeName = child.getNodeName(); 1431 if (nodeName == null) 1432 continue; 1433 1434 if (nodeName.equals(name)) { 1435 results.add(child); 1436 } 1437 } 1438 return results; 1439 } 1440 1441 protected int findChildNodeIndex( Node parent, Node childToFind ) 1442 { 1443 for( int index = 0; index < parent.getChildNodes().getLength(); index++ ) 1444 if( parent.getChildNodes().item( index ) == childToFind ) 1445 return index; 1446 return -1; 1447 } 1448 1449 protected void getArticleTextMapping(TreeWalker walker, List<MappingNode> map) throws DOMException { 1450 Node parend = walker.getCurrentNode(); 1451 1452 if( parend.getNodeType() == Node.TEXT_NODE && parend.getParentNode().getAttributes().getNamedItem("id") != null ) 1453 { 1454 if( 
parend.getTextContent().trim().length() > 0 )
			{
				int index = findChildNodeIndex( parend.getParentNode(), parend );
				if( index != -1 )
				{
					// square brackets are not valid XML/HTML identifier characters, so we can use them here
					map.add( new MappingNode(
							parend.getParentNode().getAttributes().getNamedItem("id").getNodeValue() + "["+index+"]",
							parend.getNodeValue() ) );

//					System.out.println( "ELEMENT '"+parend.getParentNode().getAttributes().getNamedItem("id").getNodeValue() + "["+index+"]"+"'");
//					System.out.println( "VALUE: '"+parend.getNodeValue()+"'" );
				}
			}
		}

		// traverse children:
		// (recursive pre-order walk; each recursive call handles the child the
		// walker is currently positioned on)
		for (Node n = walker.firstChild(); n != null; n = walker.nextSibling()) {
			getArticleTextMapping(walker, map);
		}

		// return position to the current (level up):
		walker.setCurrentNode(parend);
	}

	// Simple (id-path, text) pair produced by getArticleTextMapping.
	protected class MappingNode {
		String id;   // parent element's id plus a "[childIndex]" suffix
		String text; // the text-node content

		public MappingNode(String id, String text) { this.id = id; this.text = text; }
		public String getId() { return id; }
		public String getText() { return text; }
		@Override public String toString() { return "MappingNode(" + id + " -> " + text + ")"; }
	}

	/**
	 * Get the mapping between bits of text in the dom & their xpaths
	 *
	 * @return mapping from xpath to text, or null if no article content was found
	 */
	public List<MappingNode> getArticleTextMapping() {
		// NOTE(review): returns null (not an empty list) when there is no article
		// content — callers must check for null before iterating.
		if (articleContent == null) return null;

		List<MappingNode> map = new ArrayList<MappingNode>();

		TreeWalker walker = ((DocumentTraversal) document).createTreeWalker(articleContent, NodeFilter.SHOW_TEXT | NodeFilter.SHOW_ELEMENT, null, true);

		getArticleTextMapping(walker, map);

		return map;
	}

	/**
	 * Convenience method to build a {@link Readability} instance from an html string.
	 * @param html The html string
	 * @return new {@link Readability} instance.
1510 * @throws SAXException 1511 * @throws IOException 1512 */ 1513 public static Readability getReadability(String html) throws SAXException, IOException { 1514 return getReadability( html, false ); 1515 } 1516 1517 /** 1518 * Convenience method to build a {@link Readability} instance from an html string. 1519 * @param html The html string 1520 * @param addTitle Should the title be added to the generated article? 1521 * @return new {@link Readability} instance. 1522 * @throws SAXException 1523 * @throws IOException 1524 */ 1525 public static Readability getReadability(String html, boolean addTitle) throws SAXException, IOException { 1526 DOMParser parser = new DOMParser(); 1527 parser.parse(new InputSource(new StringReader(html))); 1528 1529 return new Readability(parser.getDocument(), false, addTitle ); 1530 } 1531 1532 /** 1533 * Testing 1534 * @param argv 1535 * @throws Exception 1536 */ 1537 public static void main(String[] argv) throws Exception { 1538// URL input = new URL("file:///home/dd/Programming/Readability4J/t.html"); 1539 URL input = new URL("http://news.bbc.co.uk/1/hi/politics/10362367.stm"); 1540 // URL input = new URL("http://euobserver.com/9/30465"); 1541 // URL input = new URL("http://euobserver.com/?aid=23383"); 1542 // URL input = new URL("http://abandoninplace.squarespace.com/blog/2010/6/8/wwdc-monday.html"); 1543 // URL input = new URL("file:///Users/jsh2/Desktop/test.html"); 1544 // URL input = new URL("http://mobile.engadget.com/2010/06/17/htc-aria-review/"); 1545 // URL input = new URL("http://thedailywtf.com/Articles/Benched.aspx"); 1546 // URL input = new URL("http://www.dailymail.co.uk/news/article-1287625/Woman-sparked-150-000-manhunt-slashing-face-crying-rape-faces-jail.html"); 1547 //URL input = new URL("http://mrpaparazzi.com/post/11619/Lindsay-Lohan-Tests-Negative-For-Alcohol-Goes-Clubbing-To-Celebrate.aspx"); 1548 //URL input = new URL("http://www.bbc.co.uk/news/world-middle-east-11415719"); 1549 //URL input = new 
URL("http://www.thebigproject.co.uk/news/"); 1550// URL input = new URL("http://blogs.euobserver.com/popescu/2009/12/15/on-euro-optimism-pessimism-and-failures/#more-958"); 1551 //URL input = new URL("http://www.cnn.com/2010/WORLD/meast/09/27/west.bank.settlement.construction/index.html?hpt=T2"); 1552 1553 //URL input = new URL("http://www.huffingtonpost.com/steven-cohen/its-time-to-enact-congest_b_740315.html"); 1554 // URL input = new URL("http://uk.mac.ign.com/articles/573/573319p1.html"); 1555 DOMParser parser = new DOMParser(); 1556 parser.parse(new InputSource(input.openStream())); 1557 1558 Readability r = new Readability(parser.getDocument(), false, true); 1559 1560 //System.out.println(r.getArticleTitle()); 1561// System.out.println(r.getArticleHTML()); 1562 //System.out.println(r.getAllLinks()); 1563 System.out.println(r.getArticleText()); 1564 1565 System.out.println(); 1566 System.out.println("***"); 1567 System.out.println(); 1568 1569 for (MappingNode s : r.getArticleTextMapping()) 1570 System.out.println(s); 1571 1572 //PrintStream out = new PrintStream("news-sites"); 1573 //for (Anchor anchor : r.getAllLinks()) { 1574 // out.println(anchor.getHref() + "\t" + anchor.getText()); 1575 //} 1576 //out.close(); 1577 1578 //System.out.println(r.getArticleImages()); 1579 // System.out.println(r.getArticleSubheadings()); 1580 // System.out.println(r.getArticleHTML()); 1581 // System.out.println(r.getArticleHTML_DOM()); 1582 1583 //System.out.println(r.getArticleDateString()); 1584 //System.out.println(r.getArticleDate()); 1585 } 1586}