001package org.openimaj.picslurper; 002 003import java.io.ByteArrayInputStream; 004import java.io.File; 005import java.io.IOException; 006import java.net.MalformedURLException; 007import java.net.URL; 008import java.util.ArrayList; 009import java.util.Arrays; 010import java.util.HashSet; 011import java.util.List; 012import java.util.Set; 013import java.util.regex.Matcher; 014import java.util.regex.Pattern; 015 016import org.apache.commons.io.FileUtils; 017import org.apache.http.HttpEntity; 018import org.apache.http.HttpRequest; 019import org.apache.http.HttpResponse; 020import org.apache.http.ProtocolException; 021import org.apache.http.protocol.HttpContext; 022import org.apache.log4j.Logger; 023import org.openimaj.image.ImageUtilities; 024import org.openimaj.image.MBFImage; 025import org.openimaj.io.HttpUtils; 026import org.openimaj.io.HttpUtils.MetaRefreshRedirectStrategy; 027import org.openimaj.picslurper.output.OutputListener; 028import org.openimaj.picslurper.output.WriteableImageOutput; 029import org.openimaj.text.nlp.patterns.URLPatternProvider; 030import org.openimaj.twitter.collection.StreamJSONStatusList.ReadableWritableJSON; 031import org.openimaj.util.pair.IndependentPair; 032import org.openimaj.web.scraping.SiteSpecificConsumer; 033import org.openimaj.web.scraping.images.CommonHTMLConsumers; 034import org.openimaj.web.scraping.images.FacebookConsumer; 035import org.openimaj.web.scraping.images.ImgurConsumer; 036import org.openimaj.web.scraping.images.InstagramConsumer; 037import org.openimaj.web.scraping.images.OwlyImageConsumer; 038import org.openimaj.web.scraping.images.TmblrPhotoConsumer; 039import org.openimaj.web.scraping.images.TwipleConsumer; 040import org.openimaj.web.scraping.images.TwitPicConsumer; 041import org.openimaj.web.scraping.images.TwitterPhotoConsumer; 042import org.openimaj.web.scraping.images.YfrogConsumer; 043 044import twitter4j.Status; 045import twitter4j.URLEntity; 046 047/** 048 * A status consumer knows how to consume a {@link ReadableWritableJSON} and 049 * output image files. Currently this {@link StatusConsumer} only understands 050 * Twitter JSON, perhaps making it abstract and turning {@link #consume(Status)} 051 * into an abstract function that can deal with other types of status would be 052 * sensible 053 * 054 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 055 * 056 */ 057public class StatusConsumer { 058 059 /** 060 * The logger 061 */ 062 public static Logger logger = Logger.getLogger(StatusConsumer.class); 063 064 final static Pattern urlPattern = new URLPatternProvider().pattern(); 065 /** 066 * the site specific consumers 067 */ 068 public final static List<SiteSpecificConsumer> siteSpecific = new ArrayList<SiteSpecificConsumer>(); 069 static { 070 StatusConsumer.siteSpecific.add(new InstagramConsumer()); 071 StatusConsumer.siteSpecific.add(new TwitterPhotoConsumer()); 072 // StatusConsumer.siteSpecific.add(new TmblrPhotoConsumer()); 073 StatusConsumer.siteSpecific.add(new TwitPicConsumer()); 074 StatusConsumer.siteSpecific.add(new ImgurConsumer()); 075 StatusConsumer.siteSpecific.add(new FacebookConsumer()); 076 StatusConsumer.siteSpecific.add(new YfrogConsumer()); 077 StatusConsumer.siteSpecific.add(new OwlyImageConsumer()); 078 StatusConsumer.siteSpecific.add(new TwipleConsumer()); 079 StatusConsumer.siteSpecific.add(CommonHTMLConsumers.FOTOLOG); 080 StatusConsumer.siteSpecific.add(CommonHTMLConsumers.PHOTONUI); 081 StatusConsumer.siteSpecific.add(CommonHTMLConsumers.PICS_LOCKERZ); 082 } 083 private boolean outputStats; 084 private File globalStats; 085 private File outputLocation; 086 087 private final Set<String> toProcess; 088 089 private final HashSet<String> previouslySeen; 090 091 private List<OutputListener> outputModes; 092 093 /** 094 * @param outputStats 095 * whether statistics should be outputted 096 * @param globalStats 097 * the global statistics file 098 * @param outputLocation 099 * the output location for this status 100 * @param outputModes 101 * the output modes informed on image downloads 102 * 103 */ 104 public StatusConsumer(final boolean outputStats, final File globalStats, final File outputLocation, 105 final List<OutputListener> outputModes) 106 { 107 this(); 108 this.outputStats = outputStats; 109 this.globalStats = globalStats; 110 this.outputLocation = outputLocation; 111 this.outputModes = outputModes; 112 113 } 114 115 /** 116 * for convenience 117 */ 118 public StatusConsumer() { 119 this.previouslySeen = new HashSet<String>(); 120 this.toProcess = new HashSet<String>(); 121 } 122 123 class LoggingStatus { 124 List<String> strings = new ArrayList<String>(); 125 } 126 127 /** 128 * @param status 129 * @return the statistics of the consumption 130 * @throws Exception 131 */ 132 public StatusConsumption consume(final Status status) throws Exception { 133 StatusConsumption cons; 134 // Now add all the entries from entities.urls 135 136 if (status.getURLEntities() != null) { 137 138 for (final URLEntity map : status.getURLEntities()) { 139 String u = map.getExpandedURL(); 140 if (u == null) { 141 u = map.getURL(); 142 } 143 if (u == null) 144 continue; 145 final String eurl = u.toString(); 146 if (eurl == null) 147 continue; 148 this.add(eurl); 149 } 150 } 151 // Find the URLs in the raw text 152 final String text = status.getText(); 153 if (text != null) { // why was text null? 154 final Matcher matcher = StatusConsumer.urlPattern.matcher(text); 155 while (matcher.find()) { 156 final String urlString = text.substring(matcher.start(), matcher.end()); 157 this.add(urlString); 158 } 159 } 160 161 // now go through all the links and process them (i.e. download them) 162 cons = this.processAll(status); 163 164 if (this.outputStats) 165 PicSlurperUtils.updateStats(this.globalStats, cons, true); 166 return cons; 167 } 168 169 /** 170 * Process all added URLs 171 * 172 * @param status 173 * @return the {@link StatusConsumption} statistics 174 * @throws IOException 175 */ 176 public StatusConsumption processAll(final Status status) throws IOException { 177 final StatusConsumption cons = new StatusConsumption(); 178 cons.nTweets = 1; 179 cons.nURLs = 0; 180 while (this.toProcess.size() > 0) { 181 final String url = this.toProcess.iterator().next(); 182 this.toProcess.remove(url); 183 cons.nURLs++; 184 final File urlOut = this.resolveURL(new URL(url), cons); 185 if (urlOut != null) { 186 final File outStats = new File(urlOut, "status.txt"); 187 PicSlurperUtils.updateStats(outStats, cons); 188 PicSlurperUtils.updateTweets(urlOut, status); 189 for (final OutputListener outputMode : this.outputModes) { 190 outputMode.newImageDownloaded(new WriteableImageOutput(status, new URL(url), urlOut, cons)); 191 } 192 } 193 194 } 195 return cons; 196 } 197 198 /** 199 * Add a URL to process without allowing already seen URLs to be added 200 * 201 * @param newURL 202 */ 203 public void add(final String newURL) { 204 boolean add = true; 205 for (final String string : this.previouslySeen) { 206 if (string.startsWith(newURL) || newURL.startsWith(string) || newURL.equals(string)) { 207 add = false; 208 break; 209 } 210 } 211 if (add) { 212 StatusConsumer.logger.debug("New URL added to list: " + newURL); 213 this.toProcess.add(newURL); 214 this.previouslySeen.add(newURL); 215 } else { 216 StatusConsumer.logger.debug("URL not added, already exists: " + newURL); 217 } 218 } 219 220 /** 221 * Given a URL, use {@link #urlToImage(URL)} to turn the url into a list of 222 * images and write the images into the output location using the names 223 * "image_N.png" 224 * 225 * @param url 226 * @param cons 227 * the consumption stats 228 * @return the root output location 229 */ 230 public File resolveURL(final URL url, final StatusConsumption cons) { 231 final List<IndependentPair<URL, MBFImage>> image = this.urlToImage(url); 232 if (image == null) 233 return null; 234 File outputDir; 235 try { 236 if (this.outputLocation == null) 237 return null; 238 outputDir = StatusConsumer.urlToOutput(url, this.outputLocation); 239 cons.nTweets++; 240 int n = 0; 241 for (final IndependentPair<URL, MBFImage> mbfImage : image) { 242 final URL urlReadFrom = mbfImage.firstObject(); 243 final MBFImage imageToWrite = mbfImage.secondObject(); 244 File outImage = null; 245 if (imageToWrite == null) { 246 StatusConsumer.logger.debug("Downloading a raw GIF"); 247 // For now this is the signal that we have a GIF. Write the 248 // gif. 249 outImage = new File(outputDir, String.format("image_%d.gif", n++)); 250 final byte[] value = HttpUtils.readURLAsBytes(urlReadFrom, false); 251 FileUtils.writeByteArrayToFile(outImage, value); 252 } else { 253 StatusConsumer.logger.debug("Downloading a normal image"); 254 outImage = new File(outputDir, String.format("image_%d.png", n++)); 255 ImageUtilities.write(imageToWrite, outImage); 256 } 257 cons.nImages++; 258 cons.imageURLs.add(urlReadFrom); 259 } 260 return outputDir; 261 } catch (final IOException e) { 262 e.printStackTrace(); 263 } 264 return null; 265 266 } 267 268 /** 269 * An extention of the {@link MetaRefreshRedirectStrategy} which disallows 270 * all redirects and instead remembers a redirect for use later on. 271 * 272 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 273 * 274 */ 275 public static class StatusConsumerRedirectStrategy extends MetaRefreshRedirectStrategy { 276 private boolean wasRedirected = false; 277 private URL redirection; 278 279 @Override 280 public boolean isRedirected(final HttpRequest request, final HttpResponse response, final HttpContext context) 281 throws ProtocolException 282 { 283 this.wasRedirected = super.isRedirected(request, response, context); 284 285 if (this.wasRedirected) { 286 try { 287 this.redirection = this.getRedirect(request, response, context).getURI().toURL(); 288 } catch (final MalformedURLException e) { 289 this.wasRedirected = false; 290 } 291 } 292 return false; 293 } 294 295 /** 296 * @return whether a redirect was found 297 */ 298 public boolean wasRedirected() { 299 return this.wasRedirected; 300 } 301 302 /** 303 * @return the redirection 304 */ 305 public URL redirection() { 306 return this.redirection; 307 } 308 } 309 310 /** 311 * First, try all the {@link SiteSpecificConsumer} instances loaded into 312 * {@link #siteSpecific}. If any consumer takes control of a link the 313 * consumer's output is used 314 * 315 * if this fails use 316 * {@link HttpUtils#readURLAsByteArrayInputStream(URL, org.apache.http.client.RedirectStrategy)} 317 * with a {@link StatusConsumerRedirectStrategy} which specifically 318 * disallows redirects to be dealt with automatically and forces this 319 * function to be called for each redirect. 320 * 321 * 322 * @param url 323 * @return a list of images or null 324 */ 325 @SuppressWarnings("unchecked") 326 public List<IndependentPair<URL, MBFImage>> urlToImage(final URL url) { 327 StatusConsumer.logger.debug("Resolving URL: " + url); 328 StatusConsumer.logger.debug("Attempting site specific consumers"); 329 List<IndependentPair<URL, MBFImage>> image = null; 330 for (final SiteSpecificConsumer consumer : StatusConsumer.siteSpecific) { 331 if (consumer.canConsume(url)) { 332 StatusConsumer.logger.debug("Site specific consumer: " + consumer.getClass().getName() 333 + " working on link"); 334 final List<URL> urlList = consumer.consume(url); 335 if (urlList != null && !urlList.isEmpty()) { 336 StatusConsumer.logger.debug("Site specific consumer returned non-null, adding the URLs"); 337 for (final URL siteSpecific : urlList) { 338 this.add(siteSpecific.toString()); 339 } 340 return image; 341 } 342 } 343 } 344 try { 345 StatusConsumer.logger.debug("Site specific consumers failed, trying the raw link"); 346 final StatusConsumerRedirectStrategy redirector = new StatusConsumerRedirectStrategy(); 347 final IndependentPair<HttpEntity, ByteArrayInputStream> headersBais = HttpUtils 348 .readURLAsByteArrayInputStream(url, 1000, 1000, redirector, HttpUtils.DEFAULT_USERAGENT); 349 if (redirector.wasRedirected()) { 350 StatusConsumer.logger.debug("Redirect intercepted, adding redirection to list"); 351 final String redirect = redirector.redirection().toString(); 352 if (!redirect.equals(url.toString())) 353 this.add(redirect); 354 return null; 355 } 356 final HttpEntity headers = headersBais.firstObject(); 357 final ByteArrayInputStream bais = headersBais.getSecondObject(); 358 final String typeValue = headers.getContentType().getValue(); 359 if (typeValue.contains("text")) { 360 this.reportFailedURL(url, "text content"); 361 return null; 362 } else { 363 // Not text? try reading it as an image! 364 MBFImage readMBF = null; 365 if (typeValue.contains("gif")) { 366 // It is a gif! just download it normally (i.e. null image 367 // but not null URL) 368 readMBF = null; 369 } else { 370 // otherwise just try to read the damn image 371 readMBF = ImageUtilities.readMBF(bais); 372 } 373 final IndependentPair<URL, MBFImage> pair = IndependentPair.pair(url, readMBF); 374 image = Arrays.asList(pair); 375 StatusConsumer.logger.debug("Link resolved, returning image."); 376 return image; 377 } 378 } catch (final Throwable e) { // This input might not be an image! deal 379 // with that 380 this.reportFailedURL(url, e.getMessage()); 381 return null; 382 } 383 } 384 385 private void reportFailedURL(final URL url, final String reason) { 386 if (this.outputModes != null) { 387 for (final OutputListener listener : this.outputModes) { 388 listener.failedURL(url, reason); 389 } 390 } 391 } 392 393 /** 394 * Construct a file in the output location for a given url 395 * 396 * @param url 397 * @param outputLocation 398 * @return a file that looks like: outputLocation/protocol/path/query/... 399 * @throws IOException 400 */ 401 public static synchronized File urlToOutput(final URL url, final File outputLocation) throws IOException { 402 String urlPath = url.getProtocol() + File.separator + 403 url.getHost() + File.separator; 404 if (!url.getPath().equals("")) 405 urlPath += StatusConsumer.sanitizeFilename(url.getPath()) + File.separator; 406 if (url.getQuery() != null) 407 urlPath += StatusConsumer.sanitizeFilename(url.getQuery()) + File.separator; 408 409 final String outPath = outputLocation.getAbsolutePath() + File.separator + urlPath; 410 final File outFile = new File(outPath); 411 if (outFile.exists()) { 412 if (outFile.isDirectory()) { 413 return outFile; 414 } else { 415 StatusConsumer.createURLOutDir(outFile); 416 } 417 } else { 418 StatusConsumer.createURLOutDir(outFile); 419 } 420 return outFile; 421 } 422 423 /** 424 * Replaces illegal characters in a filename with "_" illegal characters : : 425 * \ / * ? | < > 426 * 427 * @param name 428 * @return Sanitised filename 429 */ 430 public static String sanitizeFilename(final String name) { 431 return name.replaceAll("[:\\\\/*?|<>]", "_"); 432 } 433 434 static void createURLOutDir(final File outFile) throws IOException { 435 if (!((!outFile.exists() || outFile.delete()) && outFile.mkdirs())) { 436 throw new IOException("Couldn't create URL output: " + outFile.getAbsolutePath()); 437 } 438 } 439 440}