001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.io;
031
032import java.io.ByteArrayInputStream;
033import java.io.ByteArrayOutputStream;
034import java.io.IOException;
035import java.io.InputStream;
036import java.net.HttpURLConnection;
037import java.net.MalformedURLException;
038import java.net.URISyntaxException;
039import java.net.URL;
040import java.util.regex.Matcher;
041import java.util.regex.Pattern;
042
043import org.apache.http.Header;
044import org.apache.http.HttpEntity;
045import org.apache.http.HttpHost;
046import org.apache.http.HttpRequest;
047import org.apache.http.HttpResponse;
048import org.apache.http.ProtocolException;
049import org.apache.http.client.RedirectStrategy;
050import org.apache.http.client.methods.HttpGet;
051import org.apache.http.client.methods.HttpHead;
052import org.apache.http.client.methods.HttpUriRequest;
053import org.apache.http.client.params.HttpClientParams;
054import org.apache.http.entity.BufferedHttpEntity;
055import org.apache.http.impl.client.DefaultHttpClient;
056import org.apache.http.impl.client.DefaultRedirectStrategy;
057import org.apache.http.params.BasicHttpParams;
058import org.apache.http.params.HttpConnectionParams;
059import org.apache.http.params.HttpParams;
060import org.apache.http.params.HttpProtocolParams;
061import org.apache.http.protocol.HttpContext;
062import org.jsoup.Jsoup;
063import org.jsoup.nodes.Document;
064import org.jsoup.select.Elements;
065import org.openimaj.util.pair.IndependentPair;
066
067/**
068 * HTTP(S) download utilities, with support for HTTP redirects and meta refresh
069 * redirection.
070 *
071 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
072 */
073public class HttpUtils {
074
        /**
         * The default user-agent string. It is the value passed as the user
         * agent by the convenience overloads in this class that do not take an
         * explicit user-agent argument.
         */
        public static final String DEFAULT_USERAGENT = "Mozilla/5.0 (Windows; U; Windows NT 6.0; ru; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)";

        // All members are static; the private constructor prevents
        // instantiation from outside this class.
        private HttpUtils() {
        }
082
083        /**
084         * Read the contents of the given {@link URL} as an array of bytes.
085         * Redirects are followed automatically.
086         *
087         * @param u
088         *            the URL to read from
089         * @return the content referenced by the URL
090         * @throws IOException
091         *             if an error occurs
092         * @throws IllegalArgumentException
093         *             if the URL is not an HTTP(s) URL
094         */
095        public static byte[] readURLAsBytes(URL u) throws IOException {
096                return readURLAsBytes(u, true);
097        }
098
099        /**
100         * Read the contents of the given {@link URL} as an array of bytes. If
101         * redirects are not being followed, then the result will be null if the URL
102         * is redirected.
103         *
104         * @param u
105         *            the URL to read from
106         * @param followRedirects
107         *            should redirects be followed?
108         * @return the content referenced by the URL
109         * @throws IOException
110         *             if an error occurs
111         * @throws IllegalArgumentException
112         *             if the URL is not an HTTP(s) URL
113         */
114        public static byte[] readURLAsBytes(URL u, boolean followRedirects) throws IOException {
115                InputStream stream = readURLAsStream(u, followRedirects);
116                if (stream == null)
117                        return null;
118
119                try {
120                        return org.apache.commons.io.IOUtils.toByteArray(stream);
121                } finally {
122                        if (stream != null)
123                                stream.close();
124                }
125        }
126
127        /**
128         * A {@link RedirectStrategy} that can deal with meta-refresh style redirection
129         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
130         *
131         */
132        public static class MetaRefreshRedirectStrategy extends DefaultRedirectStrategy {
133                private static final String METAREFRESH_LOCATION = "METAREFRESH_LOCATION";
134
135                @Override
136                public boolean isRedirected(HttpRequest request, HttpResponse response, HttpContext context)
137                                throws ProtocolException
138                {
139                        boolean isRedirect = super.isRedirected(request, response, context);
140                        context.setAttribute(METAREFRESH_LOCATION, null);
141                        if (!isRedirect) {
142                                // Consume and buffer the entity, set the entity
143                                HttpEntity entity = null;
144                                try {
145                                        entity = response.getEntity();
146                                        if (!entity.isRepeatable())
147                                        {
148                                                entity = new BufferedHttpEntity(response.getEntity());
149                                                response.setEntity(entity); // Set the entity!
150                                        }
151                                        HttpHost host = (HttpHost) context.getAttribute("http.target_host");
152                                        URL url = new URL(host.toURI());
153
154
155                                        Header encodingObj = entity.getContentEncoding();
156                                        String encoding = null;
157                                        if (encodingObj == null) {
158                                                encoding = "UTF-8";
159                                        }
160                                        else {
161                                                encoding = encodingObj.getValue();
162                                                if (encoding == null) {
163                                                        encoding = "UTF-8";
164                                                }
165                                        }
166                                        URL u = checkRedirects(url, FileUtils.readall(entity.getContent(), encoding));
167                                        if (u != null) {
168                                                // set the location so it doesn't have to be read again
169                                                context.setAttribute(METAREFRESH_LOCATION, u);
170                                                return true;
171                                        }
172
173                                } catch (IOException e) {
174                                        return false;
175                                }
176                        }
177                        return isRedirect;
178                }
179
180                @Override
181                public HttpUriRequest getRedirect(HttpRequest request, HttpResponse response, HttpContext context)
182                                throws ProtocolException
183                {
184                        URL metarefresh = (URL) context.getAttribute(METAREFRESH_LOCATION);
185                        if (metarefresh == null) {
186                                return super.getRedirect(request, response, context);
187                        }
188
189                        String method = request.getRequestLine().getMethod();
190                        try {
191                                if (method.equalsIgnoreCase(HttpHead.METHOD_NAME)) {
192                                        return new HttpHead(metarefresh.toURI());
193                                } else {
194                                        return new HttpGet(metarefresh.toURI());
195                                }
196                        } catch (URISyntaxException e) {
197                                return super.getRedirect(request, response, context);
198                        }
199                }
200        }
201
202        /**
203         * Read the contents of the given {@link URL} as a
204         * {@link ByteArrayInputStream} (i.e. a byte[] in memory wrapped in an
205         * {@link InputStream}). If redirects are not being followed, then the
206         * result will be null if the URL is redirected.
207         *
208         * @param u
209         *            the URL to read from
210         * @param followRedirects
211         *            should redirects be followed?
212         * @return the content referenced by the URL
213         * @throws IOException
214         *             if an error occurs
215         * @throws IllegalArgumentException
216         *             if the URL is not an HTTP(s) URL
217         */
218        public static IndependentPair<HttpEntity, ByteArrayInputStream> readURLAsByteArrayInputStream(URL u,
219                        boolean followRedirects) throws IOException
220        {
221                return readURLAsByteArrayInputStream(u, 15000, 15000, followRedirects ? new MetaRefreshRedirectStrategy() : null,
222                                DEFAULT_USERAGENT);
223        }
224
225        /**
226         * Read the contents of the given {@link URL} as a
227         * {@link ByteArrayInputStream} (i.e. a byte[] in memory wrapped in an
228         * {@link InputStream}). If redirects are not being followed, then the
229         * result will be null if the URL is redirected.
230         *
231         * @param u
232         *            the URL to read from
233         * @param strategy
234         *            how redirects should be followed
235         * @return the content referenced by the URL
236         * @throws IOException
237         *             if an error occurs
238         * @throws IllegalArgumentException
239         *             if the URL is not an HTTP(s) URL
240         */
241        public static IndependentPair<HttpEntity, ByteArrayInputStream> readURLAsByteArrayInputStream(URL u,
242                        RedirectStrategy strategy) throws IOException
243        {
244                return readURLAsByteArrayInputStream(u, 15000, 15000, strategy, DEFAULT_USERAGENT);
245        }
246
247        /**
248         * Read the contents of the given {@link URL} as a
249         * {@link ByteArrayInputStream} (i.e. a byte[] in memory wrapped in an
250         * {@link InputStream}). If redirects are not being followed, then the
251         * result will be null if the URL is redirected.
252         *
253         * @param url
254         *            the URL to read from
255         * @param connectionTimeout
256         *            amount of time to wait for connection
257         * @param readTimeout
258         *            amount of time to wait for reading
259         * @param redirectStrategy the redirection strategy
260         * @param userAgent
261         *            the useragent string
262         * @return the content referenced by the URL
263         * @throws IOException
264         *             if an error occurs
265         * @throws IllegalArgumentException
266         *             if the URL is not an HTTP(s) URL
267         */
268        public static IndependentPair<HttpEntity, ByteArrayInputStream> readURLAsByteArrayInputStream(URL url,
269                        int connectionTimeout, int readTimeout, RedirectStrategy redirectStrategy, String userAgent)
270                        throws IOException
271        {
272                DefaultHttpClient c = null;
273                try{
274                        HttpParams params = new BasicHttpParams();
275                        HttpConnectionParams.setConnectionTimeout(params, connectionTimeout);
276                        HttpConnectionParams.setSoTimeout(params, readTimeout);
277                        HttpProtocolParams.setUserAgent(params, userAgent);
278                        HttpClientParams.setRedirecting(params, redirectStrategy != null);
279                        boolean followRedirects = redirectStrategy != null;
280                        c = new DefaultHttpClient(params);
281                        if (followRedirects)
282                                c.setRedirectStrategy(redirectStrategy);
283                        HttpResponse resp = null;
284                        try {
285                                resp = c.execute(new HttpGet(url.toURI()));
286                        } catch (URISyntaxException e) {
287                                throw new IOException(e);
288                        }
289
290                        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
291                        InputStream stream = resp.getEntity().getContent();
292                        byte[] tempBuffer = new byte[1024];
293
294                        // read the rest!
295                        while (true) {
296                                int readThisTime = stream.read(tempBuffer);
297                                if (readThisTime == -1) {
298                                        break;
299                                }
300                                // write to the outStream
301                                outStream.write(tempBuffer, 0, readThisTime);
302                        }
303                        IndependentPair<HttpEntity, ByteArrayInputStream> toRet = IndependentPair.pair(resp.getEntity(), new ByteArrayInputStream(outStream.toByteArray()));;
304                        return toRet;
305                }
306                finally{
307                        if(c!=null) c.getConnectionManager().shutdown();
308                }
309
310        }
311
312        /**
313         * Open an {@link HttpURLConnection} to the {@link URL} as an array of
314         * bytes. Redirects are followed automatically.
315         *
316         * @param url
317         *            the URL to read from
318         * @return the content referenced by the URL
319         * @throws IOException
320         *             if an error occurs
321         * @throws IllegalArgumentException
322         *             if the URL is not an HTTP(s) URL
323         */
324        public static InputStream readURL(URL url) throws IOException {
325                return readURLAsByteArrayInputStream(url, 15000, 15000, new MetaRefreshRedirectStrategy(), DEFAULT_USERAGENT).getSecondObject();
326        }
327
328        /**
329         * Open an {@link HttpURLConnection} to the {@link URL} as an array of
330         * bytes.
331         *
332         * @param url
333         *            the URL to read from
334         * @param followRedirects
335         *            should redirects be followed?
336         * @return the content referenced by the URL
337         * @throws IOException
338         *             if an error occurs
339         * @throws IllegalArgumentException
340         *             if the URL is not an HTTP(s) URL
341         */
342        public static InputStream readURL(URL url, boolean followRedirects) throws IOException {
343                return readURLAsByteArrayInputStream(url, 15000, 15000, followRedirects ? new MetaRefreshRedirectStrategy() : null, DEFAULT_USERAGENT).getSecondObject();
344        }
345
346
347        private static URL searchMetaRefresh(URL base, String html) throws MalformedURLException {
348                Document doc = Jsoup.parse(html);
349
350                Elements tags = doc.select("meta[http-equiv=refresh]");
351                if (tags != null && tags.size() > 0) {
352                        String content = tags.first().attr("content");
353
354                        Pattern pattern = Pattern.compile("\\d+\\;url\\=(.*)",Pattern.CASE_INSENSITIVE);
355                        Matcher matcher = pattern.matcher(content);
356                        if (matcher.find()) {
357                                String url = matcher.group(1);
358
359                                URL toRet = null;
360                                if (url.contains("://")) {
361                                        toRet = new URL(url);
362                                }
363                                {
364                                        toRet = new URL(base, url);
365                                }
366                                // A legitimate use of http-refresh was to refresh the current
367                                // page
368                                // this would result in a horrible loop
369                                if (!toRet.equals(base)) {
370                                        return toRet;
371                                }
372                        }
373                }
374
375                return null;
376        }
377
378        private static URL checkRedirects(URL base, String html) throws IOException {
379                URL u = searchMetaRefresh(base, html);
380
381                // potentially add more checks here for things
382                // like JS refresh
383
384                return u;
385        }
386
387        /**
388         * Open a {@link InputStream} to the contents referenced by the {@link URL}.
389         * Redirects are followed automatically.
390         *
391         * @param url
392         *            the URL to read from
393         * @return the content referenced by the URL
394         * @throws IOException
395         *             if an error occurs
396         * @throws IllegalArgumentException
397         *             if the URL is not an HTTP(s) URL
398         */
399        public static InputStream readURLAsStream(URL url) throws IOException {
400                return readURL(url);
401        }
402
403        /**
404         * Open a {@link InputStream} to the contents referenced by the {@link URL}.
405         * If redirects are not being followed, then the result will be null if the
406         * URL is redirected.
407         *
408         * @param url
409         *            the URL to read from
410         * @param followRedirects
411         *            should redirects be followed.
412         * @return the content referenced by the URL
413         * @throws IOException
414         *             if an error occurs
415         * @throws IllegalArgumentException
416         *             if the URL is not an HTTP(s) URL
417         */
418        public static InputStream readURLAsStream(URL url, boolean followRedirects) throws IOException {
419                InputStream conn = readURL(url, followRedirects);
420
421                return conn;
422        }
423
424        /**
425         * Read the internal state of an object from the given URL.
426         *
427         * @param <T>
428         *            Type of object being read.
429         *
430         * @param url
431         *            the URL to read from
432         * @param obj
433         *            the object to fill
434         * @return the content referenced by the URL
435         * @throws IOException
436         *             if an error occurs
437         * @throws IllegalArgumentException
438         *             if the URL is not an HTTP(s) URL
439         */
440        public static <T extends InternalReadable> T readURL(URL url, T obj) throws IOException {
441                InputStream stream = readURLAsStream(url);
442
443                try {
444                        return IOUtils.read(stream, obj);
445                } finally {
446                        if (stream != null)
447                                stream.close();
448                }
449        }
450
451        /**
452         * Read the an object from the given URL.
453         *
454         * @param <T>
455         *            Type of object being read.
456         *
457         * @param url
458         *            the URL to read from
459         * @param clz
460         *            the class of the object to read
461         * @return the content referenced by the URL
462         * @throws IOException
463         *             if an error occurs
464         * @throws IllegalArgumentException
465         *             if the URL is not an HTTP(s) URL
466         */
467        public static <T extends InternalReadable> T readURL(URL url, Class<? extends T> clz) throws IOException {
468                InputStream stream = readURLAsStream(url);
469
470                try {
471                        return IOUtils.read(stream, clz);
472                } finally {
473                        if (stream != null)
474                                stream.close();
475                }
476        }
477
478        /**
479         * Read the an object from the given URL.
480         *
481         * @param <T>
482         *            Type of object being read.
483         * @param <Q>
484         *            Type of the object reader.
485         *
486         * @param url
487         *            the URL to read from
488         * @param reader
489         *            the reader that creates the object.
490         * @return the content referenced by the URL
491         * @throws IOException
492         *             if an error occurs
493         * @throws IllegalArgumentException
494         *             if the URL is not an HTTP(s) URL
495         */
496        public static <T, Q extends InputStreamObjectReader<T>> T readURL(URL url, Q reader) throws IOException {
497                InputStream stream = readURLAsStream(url);
498
499                try {
500                        return reader.read(stream);
501                } finally {
502                        if (stream != null)
503                                stream.close();
504                }
505        }
506}