001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.io; 031 032import java.io.ByteArrayInputStream; 033import java.io.ByteArrayOutputStream; 034import java.io.IOException; 035import java.io.InputStream; 036import java.net.HttpURLConnection; 037import java.net.MalformedURLException; 038import java.net.URISyntaxException; 039import java.net.URL; 040import java.util.regex.Matcher; 041import java.util.regex.Pattern; 042 043import org.apache.http.Header; 044import org.apache.http.HttpEntity; 045import org.apache.http.HttpHost; 046import org.apache.http.HttpRequest; 047import org.apache.http.HttpResponse; 048import org.apache.http.ProtocolException; 049import org.apache.http.client.RedirectStrategy; 050import org.apache.http.client.methods.HttpGet; 051import org.apache.http.client.methods.HttpHead; 052import org.apache.http.client.methods.HttpUriRequest; 053import org.apache.http.client.params.HttpClientParams; 054import org.apache.http.entity.BufferedHttpEntity; 055import org.apache.http.impl.client.DefaultHttpClient; 056import org.apache.http.impl.client.DefaultRedirectStrategy; 057import org.apache.http.params.BasicHttpParams; 058import org.apache.http.params.HttpConnectionParams; 059import org.apache.http.params.HttpParams; 060import org.apache.http.params.HttpProtocolParams; 061import org.apache.http.protocol.HttpContext; 062import org.jsoup.Jsoup; 063import org.jsoup.nodes.Document; 064import org.jsoup.select.Elements; 065import org.openimaj.util.pair.IndependentPair; 066 067/** 068 * HTTP(S) download utilities, with support for HTTP redirects and meta refresh 069 * redirection. 070 * 071 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 072 */ 073public class HttpUtils { 074 075 /** 076 * The default user-agent string 077 */ 078 public static final String DEFAULT_USERAGENT = "Mozilla/5.0 (Windows; U; Windows NT 6.0; ru; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)"; 079 080 private HttpUtils() { 081 } 082 083 /** 084 * Read the contents of the given {@link URL} as an array of bytes. 085 * Redirects are followed automatically. 086 * 087 * @param u 088 * the URL to read from 089 * @return the content referenced by the URL 090 * @throws IOException 091 * if an error occurs 092 * @throws IllegalArgumentException 093 * if the URL is not an HTTP(s) URL 094 */ 095 public static byte[] readURLAsBytes(URL u) throws IOException { 096 return readURLAsBytes(u, true); 097 } 098 099 /** 100 * Read the contents of the given {@link URL} as an array of bytes. If 101 * redirects are not being followed, then the result will be null if the URL 102 * is redirected. 103 * 104 * @param u 105 * the URL to read from 106 * @param followRedirects 107 * should redirects be followed? 108 * @return the content referenced by the URL 109 * @throws IOException 110 * if an error occurs 111 * @throws IllegalArgumentException 112 * if the URL is not an HTTP(s) URL 113 */ 114 public static byte[] readURLAsBytes(URL u, boolean followRedirects) throws IOException { 115 InputStream stream = readURLAsStream(u, followRedirects); 116 if (stream == null) 117 return null; 118 119 try { 120 return org.apache.commons.io.IOUtils.toByteArray(stream); 121 } finally { 122 if (stream != null) 123 stream.close(); 124 } 125 } 126 127 /** 128 * A {@link RedirectStrategy} that can deal with meta-refresh style redirection 129 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 130 * 131 */ 132 public static class MetaRefreshRedirectStrategy extends DefaultRedirectStrategy { 133 private static final String METAREFRESH_LOCATION = "METAREFRESH_LOCATION"; 134 135 @Override 136 public boolean isRedirected(HttpRequest request, HttpResponse response, HttpContext context) 137 throws ProtocolException 138 { 139 boolean isRedirect = super.isRedirected(request, response, context); 140 context.setAttribute(METAREFRESH_LOCATION, null); 141 if (!isRedirect) { 142 // Consume and buffer the entity, set the entity 143 HttpEntity entity = null; 144 try { 145 entity = response.getEntity(); 146 if (!entity.isRepeatable()) 147 { 148 entity = new BufferedHttpEntity(response.getEntity()); 149 response.setEntity(entity); // Set the entity! 150 } 151 HttpHost host = (HttpHost) context.getAttribute("http.target_host"); 152 URL url = new URL(host.toURI()); 153 154 155 Header encodingObj = entity.getContentEncoding(); 156 String encoding = null; 157 if (encodingObj == null) { 158 encoding = "UTF-8"; 159 } 160 else { 161 encoding = encodingObj.getValue(); 162 if (encoding == null) { 163 encoding = "UTF-8"; 164 } 165 } 166 URL u = checkRedirects(url, FileUtils.readall(entity.getContent(), encoding)); 167 if (u != null) { 168 // set the location so it doesn't have to be read again 169 context.setAttribute(METAREFRESH_LOCATION, u); 170 return true; 171 } 172 173 } catch (IOException e) { 174 return false; 175 } 176 } 177 return isRedirect; 178 } 179 180 @Override 181 public HttpUriRequest getRedirect(HttpRequest request, HttpResponse response, HttpContext context) 182 throws ProtocolException 183 { 184 URL metarefresh = (URL) context.getAttribute(METAREFRESH_LOCATION); 185 if (metarefresh == null) { 186 return super.getRedirect(request, response, context); 187 } 188 189 String method = request.getRequestLine().getMethod(); 190 try { 191 if (method.equalsIgnoreCase(HttpHead.METHOD_NAME)) { 192 return new HttpHead(metarefresh.toURI()); 193 } else { 194 return new HttpGet(metarefresh.toURI()); 195 } 196 } catch (URISyntaxException e) { 197 return super.getRedirect(request, response, context); 198 } 199 } 200 } 201 202 /** 203 * Read the contents of the given {@link URL} as a 204 * {@link ByteArrayInputStream} (i.e. a byte[] in memory wrapped in an 205 * {@link InputStream}). If redirects are not being followed, then the 206 * result will be null if the URL is redirected. 207 * 208 * @param u 209 * the URL to read from 210 * @param followRedirects 211 * should redirects be followed? 212 * @return the content referenced by the URL 213 * @throws IOException 214 * if an error occurs 215 * @throws IllegalArgumentException 216 * if the URL is not an HTTP(s) URL 217 */ 218 public static IndependentPair<HttpEntity, ByteArrayInputStream> readURLAsByteArrayInputStream(URL u, 219 boolean followRedirects) throws IOException 220 { 221 return readURLAsByteArrayInputStream(u, 15000, 15000, followRedirects ? new MetaRefreshRedirectStrategy() : null, 222 DEFAULT_USERAGENT); 223 } 224 225 /** 226 * Read the contents of the given {@link URL} as a 227 * {@link ByteArrayInputStream} (i.e. a byte[] in memory wrapped in an 228 * {@link InputStream}). If redirects are not being followed, then the 229 * result will be null if the URL is redirected. 230 * 231 * @param u 232 * the URL to read from 233 * @param strategy 234 * how redirects should be followed 235 * @return the content referenced by the URL 236 * @throws IOException 237 * if an error occurs 238 * @throws IllegalArgumentException 239 * if the URL is not an HTTP(s) URL 240 */ 241 public static IndependentPair<HttpEntity, ByteArrayInputStream> readURLAsByteArrayInputStream(URL u, 242 RedirectStrategy strategy) throws IOException 243 { 244 return readURLAsByteArrayInputStream(u, 15000, 15000, strategy, DEFAULT_USERAGENT); 245 } 246 247 /** 248 * Read the contents of the given {@link URL} as a 249 * {@link ByteArrayInputStream} (i.e. a byte[] in memory wrapped in an 250 * {@link InputStream}). If redirects are not being followed, then the 251 * result will be null if the URL is redirected. 252 * 253 * @param url 254 * the URL to read from 255 * @param connectionTimeout 256 * amount of time to wait for connection 257 * @param readTimeout 258 * amount of time to wait for reading 259 * @param redirectStrategy the redirection strategy 260 * @param userAgent 261 * the useragent string 262 * @return the content referenced by the URL 263 * @throws IOException 264 * if an error occurs 265 * @throws IllegalArgumentException 266 * if the URL is not an HTTP(s) URL 267 */ 268 public static IndependentPair<HttpEntity, ByteArrayInputStream> readURLAsByteArrayInputStream(URL url, 269 int connectionTimeout, int readTimeout, RedirectStrategy redirectStrategy, String userAgent) 270 throws IOException 271 { 272 DefaultHttpClient c = null; 273 try{ 274 HttpParams params = new BasicHttpParams(); 275 HttpConnectionParams.setConnectionTimeout(params, connectionTimeout); 276 HttpConnectionParams.setSoTimeout(params, readTimeout); 277 HttpProtocolParams.setUserAgent(params, userAgent); 278 HttpClientParams.setRedirecting(params, redirectStrategy != null); 279 boolean followRedirects = redirectStrategy != null; 280 c = new DefaultHttpClient(params); 281 if (followRedirects) 282 c.setRedirectStrategy(redirectStrategy); 283 HttpResponse resp = null; 284 try { 285 resp = c.execute(new HttpGet(url.toURI())); 286 } catch (URISyntaxException e) { 287 throw new IOException(e); 288 } 289 290 ByteArrayOutputStream outStream = new ByteArrayOutputStream(); 291 InputStream stream = resp.getEntity().getContent(); 292 byte[] tempBuffer = new byte[1024]; 293 294 // read the rest! 295 while (true) { 296 int readThisTime = stream.read(tempBuffer); 297 if (readThisTime == -1) { 298 break; 299 } 300 // write to the outStream 301 outStream.write(tempBuffer, 0, readThisTime); 302 } 303 IndependentPair<HttpEntity, ByteArrayInputStream> toRet = IndependentPair.pair(resp.getEntity(), new ByteArrayInputStream(outStream.toByteArray()));; 304 return toRet; 305 } 306 finally{ 307 if(c!=null) c.getConnectionManager().shutdown(); 308 } 309 310 } 311 312 /** 313 * Open an {@link HttpURLConnection} to the {@link URL} as an array of 314 * bytes. Redirects are followed automatically. 315 * 316 * @param url 317 * the URL to read from 318 * @return the content referenced by the URL 319 * @throws IOException 320 * if an error occurs 321 * @throws IllegalArgumentException 322 * if the URL is not an HTTP(s) URL 323 */ 324 public static InputStream readURL(URL url) throws IOException { 325 return readURLAsByteArrayInputStream(url, 15000, 15000, new MetaRefreshRedirectStrategy(), DEFAULT_USERAGENT).getSecondObject(); 326 } 327 328 /** 329 * Open an {@link HttpURLConnection} to the {@link URL} as an array of 330 * bytes. 331 * 332 * @param url 333 * the URL to read from 334 * @param followRedirects 335 * should redirects be followed? 336 * @return the content referenced by the URL 337 * @throws IOException 338 * if an error occurs 339 * @throws IllegalArgumentException 340 * if the URL is not an HTTP(s) URL 341 */ 342 public static InputStream readURL(URL url, boolean followRedirects) throws IOException { 343 return readURLAsByteArrayInputStream(url, 15000, 15000, followRedirects ? new MetaRefreshRedirectStrategy() : null, DEFAULT_USERAGENT).getSecondObject(); 344 } 345 346 347 private static URL searchMetaRefresh(URL base, String html) throws MalformedURLException { 348 Document doc = Jsoup.parse(html); 349 350 Elements tags = doc.select("meta[http-equiv=refresh]"); 351 if (tags != null && tags.size() > 0) { 352 String content = tags.first().attr("content"); 353 354 Pattern pattern = Pattern.compile("\\d+\\;url\\=(.*)",Pattern.CASE_INSENSITIVE); 355 Matcher matcher = pattern.matcher(content); 356 if (matcher.find()) { 357 String url = matcher.group(1); 358 359 URL toRet = null; 360 if (url.contains("://")) { 361 toRet = new URL(url); 362 } 363 { 364 toRet = new URL(base, url); 365 } 366 // A legitimate use of http-refresh was to refresh the current 367 // page 368 // this would result in a horrible loop 369 if (!toRet.equals(base)) { 370 return toRet; 371 } 372 } 373 } 374 375 return null; 376 } 377 378 private static URL checkRedirects(URL base, String html) throws IOException { 379 URL u = searchMetaRefresh(base, html); 380 381 // potentially add more checks here for things 382 // like JS refresh 383 384 return u; 385 } 386 387 /** 388 * Open a {@link InputStream} to the contents referenced by the {@link URL}. 389 * Redirects are followed automatically. 390 * 391 * @param url 392 * the URL to read from 393 * @return the content referenced by the URL 394 * @throws IOException 395 * if an error occurs 396 * @throws IllegalArgumentException 397 * if the URL is not an HTTP(s) URL 398 */ 399 public static InputStream readURLAsStream(URL url) throws IOException { 400 return readURL(url); 401 } 402 403 /** 404 * Open a {@link InputStream} to the contents referenced by the {@link URL}. 405 * If redirects are not being followed, then the result will be null if the 406 * URL is redirected. 407 * 408 * @param url 409 * the URL to read from 410 * @param followRedirects 411 * should redirects be followed. 412 * @return the content referenced by the URL 413 * @throws IOException 414 * if an error occurs 415 * @throws IllegalArgumentException 416 * if the URL is not an HTTP(s) URL 417 */ 418 public static InputStream readURLAsStream(URL url, boolean followRedirects) throws IOException { 419 InputStream conn = readURL(url, followRedirects); 420 421 return conn; 422 } 423 424 /** 425 * Read the internal state of an object from the given URL. 426 * 427 * @param <T> 428 * Type of object being read. 429 * 430 * @param url 431 * the URL to read from 432 * @param obj 433 * the object to fill 434 * @return the content referenced by the URL 435 * @throws IOException 436 * if an error occurs 437 * @throws IllegalArgumentException 438 * if the URL is not an HTTP(s) URL 439 */ 440 public static <T extends InternalReadable> T readURL(URL url, T obj) throws IOException { 441 InputStream stream = readURLAsStream(url); 442 443 try { 444 return IOUtils.read(stream, obj); 445 } finally { 446 if (stream != null) 447 stream.close(); 448 } 449 } 450 451 /** 452 * Read the an object from the given URL. 453 * 454 * @param <T> 455 * Type of object being read. 456 * 457 * @param url 458 * the URL to read from 459 * @param clz 460 * the class of the object to read 461 * @return the content referenced by the URL 462 * @throws IOException 463 * if an error occurs 464 * @throws IllegalArgumentException 465 * if the URL is not an HTTP(s) URL 466 */ 467 public static <T extends InternalReadable> T readURL(URL url, Class<? extends T> clz) throws IOException { 468 InputStream stream = readURLAsStream(url); 469 470 try { 471 return IOUtils.read(stream, clz); 472 } finally { 473 if (stream != null) 474 stream.close(); 475 } 476 } 477 478 /** 479 * Read the an object from the given URL. 480 * 481 * @param <T> 482 * Type of object being read. 483 * @param <Q> 484 * Type of the object reader. 485 * 486 * @param url 487 * the URL to read from 488 * @param reader 489 * the reader that creates the object. 490 * @return the content referenced by the URL 491 * @throws IOException 492 * if an error occurs 493 * @throws IllegalArgumentException 494 * if the URL is not an HTTP(s) URL 495 */ 496 public static <T, Q extends InputStreamObjectReader<T>> T readURL(URL url, Q reader) throws IOException { 497 InputStream stream = readURLAsStream(url); 498 499 try { 500 return reader.read(stream); 501 } finally { 502 if (stream != null) 503 stream.close(); 504 } 505 } 506}