/**
 * Copyright 2011 The University of Southampton, Yahoo Inc., and the
 * individual contributors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.openimaj.web.scraping.images;

import java.net.URL;
import java.util.Arrays;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.openimaj.web.scraping.SiteSpecificConsumer;

/**
 * Download images from twitter's own image hosting service
 *
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 *
 */
public class TwitterPhotoConsumer implements SiteSpecificConsumer {
    /**
     * Decide whether the given URL looks like a twitter photo page, e.g.
     * {@code http://twitter.com/HutchSelenator/status/222772697531301890/photo/1}.
     *
     * @param url
     *            the URL to inspect
     * @return true if the host is {@code twitter.com} (compared
     *         case-insensitively) and the path contains {@code "photo"}
     */
    @Override
    public boolean canConsume(URL url) {
        // Host names are case-insensitive and URL#getHost() does not
        // normalise case, so compare ignoring case.
        return url.getHost().equalsIgnoreCase("twitter.com") && url.getPath().contains("photo");
    }

    /**
     * Fetch the "large" variant of a twitter photo page and extract the
     * address of the image it displays.
     *
     * @param url
     *            the twitter photo page URL
     * @return a single-element list holding the URL read from the
     *         {@code src} attribute of the first element matching
     *         {@code .media-slideshow-image}, or {@code null} if the page
     *         could not be fetched, no such element exists, or the
     *         attribute is not a valid URL
     */
    @Override
    public List<URL> consume(URL url) {
        String largeURLStr = url.toString();
        if (!largeURLStr.endsWith("large")) {
            // Avoid producing ".../photo/1//large" when the URL already
            // ends with a slash.
            largeURLStr += largeURLStr.endsWith("/") ? "large" : "/large";
        }

        try {
            final Document doc = Jsoup.connect(largeURLStr).get();
            final Elements largeimage = doc.select(".media-slideshow-image");
            if (largeimage.isEmpty()) {
                // No image element on the page: report failure explicitly
                // rather than via an IndexOutOfBoundsException.
                return null;
            }

            final URL link = new URL(largeimage.get(0).attr("src"));
            return Arrays.asList(link);
        } catch (final Exception e) {
            // Best-effort scrape: any network, parsing or URL failure is
            // reported to the caller as null.
            return null;
        }
    }
}