Source code

001package org.openimaj.picslurper;
002
003import java.io.ByteArrayInputStream;
004import java.io.File;
005import java.io.IOException;
006import java.net.MalformedURLException;
007import java.net.URL;
008import java.util.ArrayList;
009import java.util.Arrays;
010import java.util.HashSet;
011import java.util.List;
012import java.util.Set;
013import java.util.regex.Matcher;
014import java.util.regex.Pattern;
015
016import org.apache.commons.io.FileUtils;
017import org.apache.http.HttpEntity;
018import org.apache.http.HttpRequest;
019import org.apache.http.HttpResponse;
020import org.apache.http.ProtocolException;
021import org.apache.http.protocol.HttpContext;
022import org.apache.log4j.Logger;
023import org.openimaj.image.ImageUtilities;
024import org.openimaj.image.MBFImage;
025import org.openimaj.io.HttpUtils;
026import org.openimaj.io.HttpUtils.MetaRefreshRedirectStrategy;
027import org.openimaj.picslurper.output.OutputListener;
028import org.openimaj.picslurper.output.WriteableImageOutput;
029import org.openimaj.text.nlp.patterns.URLPatternProvider;
030import org.openimaj.twitter.collection.StreamJSONStatusList.ReadableWritableJSON;
031import org.openimaj.util.pair.IndependentPair;
032import org.openimaj.web.scraping.SiteSpecificConsumer;
033import org.openimaj.web.scraping.images.CommonHTMLConsumers;
034import org.openimaj.web.scraping.images.FacebookConsumer;
035import org.openimaj.web.scraping.images.ImgurConsumer;
036import org.openimaj.web.scraping.images.InstagramConsumer;
037import org.openimaj.web.scraping.images.OwlyImageConsumer;
038import org.openimaj.web.scraping.images.TmblrPhotoConsumer;
039import org.openimaj.web.scraping.images.TwipleConsumer;
040import org.openimaj.web.scraping.images.TwitPicConsumer;
041import org.openimaj.web.scraping.images.TwitterPhotoConsumer;
042import org.openimaj.web.scraping.images.YfrogConsumer;
043
044import twitter4j.Status;
045import twitter4j.URLEntity;
046
047/**
048 * A status consumer knows how to consume a {@link ReadableWritableJSON} and
049 * output image files. Currently this {@link StatusConsumer} only understands
050 * Twitter JSON, perhaps making it abstract and turning {@link #consume(Status)}
051 * into an abstract function that can deal with other types of status would be
052 * sensible
053 * 
054 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
055 * 
056 */
057public class StatusConsumer {
058
059        /**
060         * The logger
061         */
062        public static Logger logger = Logger.getLogger(StatusConsumer.class);
063
064        final static Pattern urlPattern = new URLPatternProvider().pattern();
065        /**
066         * the site specific consumers
067         */
068        public final static List<SiteSpecificConsumer> siteSpecific = new ArrayList<SiteSpecificConsumer>();
069        static {
070                StatusConsumer.siteSpecific.add(new InstagramConsumer());
071                StatusConsumer.siteSpecific.add(new TwitterPhotoConsumer());
072                // StatusConsumer.siteSpecific.add(new TmblrPhotoConsumer());
073                StatusConsumer.siteSpecific.add(new TwitPicConsumer());
074                StatusConsumer.siteSpecific.add(new ImgurConsumer());
075                StatusConsumer.siteSpecific.add(new FacebookConsumer());
076                StatusConsumer.siteSpecific.add(new YfrogConsumer());
077                StatusConsumer.siteSpecific.add(new OwlyImageConsumer());
078                StatusConsumer.siteSpecific.add(new TwipleConsumer());
079                StatusConsumer.siteSpecific.add(CommonHTMLConsumers.FOTOLOG);
080                StatusConsumer.siteSpecific.add(CommonHTMLConsumers.PHOTONUI);
081                StatusConsumer.siteSpecific.add(CommonHTMLConsumers.PICS_LOCKERZ);
082        }
083        private boolean outputStats;
084        private File globalStats;
085        private File outputLocation;
086
087        private final Set<String> toProcess;
088
089        private final HashSet<String> previouslySeen;
090
091        private List<OutputListener> outputModes;
092
093        /**
094         * @param outputStats
095         *            whether statistics should be outputted
096         * @param globalStats
097         *            the global statistics file
098         * @param outputLocation
099         *            the output location for this status
100         * @param outputModes
101         *            the output modes informed on image downloads
102         * 
103         */
104        public StatusConsumer(final boolean outputStats, final File globalStats, final File outputLocation,
105                        final List<OutputListener> outputModes)
106        {
107                this();
108                this.outputStats = outputStats;
109                this.globalStats = globalStats;
110                this.outputLocation = outputLocation;
111                this.outputModes = outputModes;
112
113        }
114
115        /**
116         * for convenience
117         */
118        public StatusConsumer() {
119                this.previouslySeen = new HashSet<String>();
120                this.toProcess = new HashSet<String>();
121        }
122
123        class LoggingStatus {
124                List<String> strings = new ArrayList<String>();
125        }
126
127        /**
128         * @param status
129         * @return the statistics of the consumption
130         * @throws Exception
131         */
132        public StatusConsumption consume(final Status status) throws Exception {
133                StatusConsumption cons;
134                // Now add all the entries from entities.urls
135
136                if (status.getURLEntities() != null) {
137
138                        for (final URLEntity map : status.getURLEntities()) {
139                                String u = map.getExpandedURL();
140                                if (u == null) {
141                                        u = map.getURL();
142                                }
143                                if (u == null)
144                                        continue;
145                                final String eurl = u.toString();
146                                if (eurl == null)
147                                        continue;
148                                this.add(eurl);
149                        }
150                }
151                // Find the URLs in the raw text
152                final String text = status.getText();
153                if (text != null) { // why was text null?
154                        final Matcher matcher = StatusConsumer.urlPattern.matcher(text);
155                        while (matcher.find()) {
156                                final String urlString = text.substring(matcher.start(), matcher.end());
157                                this.add(urlString);
158                        }
159                }
160
161                // now go through all the links and process them (i.e. download them)
162                cons = this.processAll(status);
163
164                if (this.outputStats)
165                        PicSlurperUtils.updateStats(this.globalStats, cons, true);
166                return cons;
167        }
168
169        /**
170         * Process all added URLs
171         * 
172         * @param status
173         * @return the {@link StatusConsumption} statistics
174         * @throws IOException
175         */
176        public StatusConsumption processAll(final Status status) throws IOException {
177                final StatusConsumption cons = new StatusConsumption();
178                cons.nTweets = 1;
179                cons.nURLs = 0;
180                while (this.toProcess.size() > 0) {
181                        final String url = this.toProcess.iterator().next();
182                        this.toProcess.remove(url);
183                        cons.nURLs++;
184                        final File urlOut = this.resolveURL(new URL(url), cons);
185                        if (urlOut != null) {
186                                final File outStats = new File(urlOut, "status.txt");
187                                PicSlurperUtils.updateStats(outStats, cons);
188                                PicSlurperUtils.updateTweets(urlOut, status);
189                                for (final OutputListener outputMode : this.outputModes) {
190                                        outputMode.newImageDownloaded(new WriteableImageOutput(status, new URL(url), urlOut, cons));
191                                }
192                        }
193
194                }
195                return cons;
196        }
197
198        /**
199         * Add a URL to process without allowing already seen URLs to be added
200         * 
201         * @param newURL
202         */
203        public void add(final String newURL) {
204                boolean add = true;
205                for (final String string : this.previouslySeen) {
206                        if (string.startsWith(newURL) || newURL.startsWith(string) || newURL.equals(string)) {
207                                add = false;
208                                break;
209                        }
210                }
211                if (add) {
212                        StatusConsumer.logger.debug("New URL added to list: " + newURL);
213                        this.toProcess.add(newURL);
214                        this.previouslySeen.add(newURL);
215                } else {
216                        StatusConsumer.logger.debug("URL not added, already exists: " + newURL);
217                }
218        }
219
220        /**
221         * Given a URL, use {@link #urlToImage(URL)} to turn the url into a list of
222         * images and write the images into the output location using the names
223         * "image_N.png"
224         * 
225         * @param url
226         * @param cons
227         *            the consumption stats
228         * @return the root output location
229         */
230        public File resolveURL(final URL url, final StatusConsumption cons) {
231                final List<IndependentPair<URL, MBFImage>> image = this.urlToImage(url);
232                if (image == null)
233                        return null;
234                File outputDir;
235                try {
236                        if (this.outputLocation == null)
237                                return null;
238                        outputDir = StatusConsumer.urlToOutput(url, this.outputLocation);
239                        cons.nTweets++;
240                        int n = 0;
241                        for (final IndependentPair<URL, MBFImage> mbfImage : image) {
242                                final URL urlReadFrom = mbfImage.firstObject();
243                                final MBFImage imageToWrite = mbfImage.secondObject();
244                                File outImage = null;
245                                if (imageToWrite == null) {
246                                        StatusConsumer.logger.debug("Downloading a raw GIF");
247                                        // For now this is the signal that we have a GIF. Write the
248                                        // gif.
249                                        outImage = new File(outputDir, String.format("image_%d.gif", n++));
250                                        final byte[] value = HttpUtils.readURLAsBytes(urlReadFrom, false);
251                                        FileUtils.writeByteArrayToFile(outImage, value);
252                                } else {
253                                        StatusConsumer.logger.debug("Downloading a normal image");
254                                        outImage = new File(outputDir, String.format("image_%d.png", n++));
255                                        ImageUtilities.write(imageToWrite, outImage);
256                                }
257                                cons.nImages++;
258                                cons.imageURLs.add(urlReadFrom);
259                        }
260                        return outputDir;
261                } catch (final IOException e) {
262                        e.printStackTrace();
263                }
264                return null;
265
266        }
267
268        /**
269         * An extention of the {@link MetaRefreshRedirectStrategy} which disallows
270         * all redirects and instead remembers a redirect for use later on.
271         * 
272         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
273         * 
274         */
275        public static class StatusConsumerRedirectStrategy extends MetaRefreshRedirectStrategy {
276                private boolean wasRedirected = false;
277                private URL redirection;
278
279                @Override
280                public boolean isRedirected(final HttpRequest request, final HttpResponse response, final HttpContext context)
281                                throws ProtocolException
282                {
283                        this.wasRedirected = super.isRedirected(request, response, context);
284
285                        if (this.wasRedirected) {
286                                try {
287                                        this.redirection = this.getRedirect(request, response, context).getURI().toURL();
288                                } catch (final MalformedURLException e) {
289                                        this.wasRedirected = false;
290                                }
291                        }
292                        return false;
293                }
294
295                /**
296                 * @return whether a redirect was found
297                 */
298                public boolean wasRedirected() {
299                        return this.wasRedirected;
300                }
301
302                /**
303                 * @return the redirection
304                 */
305                public URL redirection() {
306                        return this.redirection;
307                }
308        }
309
310        /**
311         * First, try all the {@link SiteSpecificConsumer} instances loaded into
312         * {@link #siteSpecific}. If any consumer takes control of a link the
313         * consumer's output is used
314         * 
315         * if this fails use
316         * {@link HttpUtils#readURLAsByteArrayInputStream(URL, org.apache.http.client.RedirectStrategy)}
317         * with a {@link StatusConsumerRedirectStrategy} which specifically
318         * disallows redirects to be dealt with automatically and forces this
319         * function to be called for each redirect.
320         * 
321         * 
322         * @param url
323         * @return a list of images or null
324         */
325        @SuppressWarnings("unchecked")
326        public List<IndependentPair<URL, MBFImage>> urlToImage(final URL url) {
327                StatusConsumer.logger.debug("Resolving URL: " + url);
328                StatusConsumer.logger.debug("Attempting site specific consumers");
329                List<IndependentPair<URL, MBFImage>> image = null;
330                for (final SiteSpecificConsumer consumer : StatusConsumer.siteSpecific) {
331                        if (consumer.canConsume(url)) {
332                                StatusConsumer.logger.debug("Site specific consumer: " + consumer.getClass().getName()
333                                                + " working on link");
334                                final List<URL> urlList = consumer.consume(url);
335                                if (urlList != null && !urlList.isEmpty()) {
336                                        StatusConsumer.logger.debug("Site specific consumer returned non-null, adding the URLs");
337                                        for (final URL siteSpecific : urlList) {
338                                                this.add(siteSpecific.toString());
339                                        }
340                                        return image;
341                                }
342                        }
343                }
344                try {
345                        StatusConsumer.logger.debug("Site specific consumers failed, trying the raw link");
346                        final StatusConsumerRedirectStrategy redirector = new StatusConsumerRedirectStrategy();
347                        final IndependentPair<HttpEntity, ByteArrayInputStream> headersBais = HttpUtils
348                                        .readURLAsByteArrayInputStream(url, 1000, 1000, redirector, HttpUtils.DEFAULT_USERAGENT);
349                        if (redirector.wasRedirected()) {
350                                StatusConsumer.logger.debug("Redirect intercepted, adding redirection to list");
351                                final String redirect = redirector.redirection().toString();
352                                if (!redirect.equals(url.toString()))
353                                        this.add(redirect);
354                                return null;
355                        }
356                        final HttpEntity headers = headersBais.firstObject();
357                        final ByteArrayInputStream bais = headersBais.getSecondObject();
358                        final String typeValue = headers.getContentType().getValue();
359                        if (typeValue.contains("text")) {
360                                this.reportFailedURL(url, "text content");
361                                return null;
362                        } else {
363                                // Not text? try reading it as an image!
364                                MBFImage readMBF = null;
365                                if (typeValue.contains("gif")) {
366                                        // It is a gif! just download it normally (i.e. null image
367                                        // but not null URL)
368                                        readMBF = null;
369                                } else {
370                                        // otherwise just try to read the damn image
371                                        readMBF = ImageUtilities.readMBF(bais);
372                                }
373                                final IndependentPair<URL, MBFImage> pair = IndependentPair.pair(url, readMBF);
374                                image = Arrays.asList(pair);
375                                StatusConsumer.logger.debug("Link resolved, returning image.");
376                                return image;
377                        }
378                } catch (final Throwable e) { // This input might not be an image! deal
379                        // with that
380                        this.reportFailedURL(url, e.getMessage());
381                        return null;
382                }
383        }
384
385        private void reportFailedURL(final URL url, final String reason) {
386                if (this.outputModes != null) {
387                        for (final OutputListener listener : this.outputModes) {
388                                listener.failedURL(url, reason);
389                        }
390                }
391        }
392
393        /**
394         * Construct a file in the output location for a given url
395         * 
396         * @param url
397         * @param outputLocation
398         * @return a file that looks like: outputLocation/protocol/path/query/...
399         * @throws IOException
400         */
401        public static synchronized File urlToOutput(final URL url, final File outputLocation) throws IOException {
402                String urlPath = url.getProtocol() + File.separator +
403                                url.getHost() + File.separator;
404                if (!url.getPath().equals(""))
405                        urlPath += StatusConsumer.sanitizeFilename(url.getPath()) + File.separator;
406                if (url.getQuery() != null)
407                        urlPath += StatusConsumer.sanitizeFilename(url.getQuery()) + File.separator;
408
409                final String outPath = outputLocation.getAbsolutePath() + File.separator + urlPath;
410                final File outFile = new File(outPath);
411                if (outFile.exists()) {
412                        if (outFile.isDirectory()) {
413                                return outFile;
414                        } else {
415                                StatusConsumer.createURLOutDir(outFile);
416                        }
417                } else {
418                        StatusConsumer.createURLOutDir(outFile);
419                }
420                return outFile;
421        }
422
423        /**
424         * Replaces illegal characters in a filename with "_" illegal characters : :
425         * \ / * ? | < >
426         * 
427         * @param name
428         * @return Sanitised filename
429         */
430        public static String sanitizeFilename(final String name) {
431                return name.replaceAll("[:\\\\/*?|<>]", "_");
432        }
433
434        static void createURLOutDir(final File outFile) throws IOException {
435                if (!((!outFile.exists() || outFile.delete()) && outFile.mkdirs())) {
436                        throw new IOException("Couldn't create URL output: " + outFile.getAbsolutePath());
437                }
438        }
439
440}