/**
 * Copyright (c) 2012, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.tools.twitter.options;

import java.util.ArrayList;
import java.util.List;

import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.kohsuke.args4j.ProxyOptionHandler;
import org.openimaj.tools.InOutToolOptions;
import org.openimaj.tools.twitter.modes.filter.TwitterPreprocessingFilterOption;
import org.openimaj.tools.twitter.modes.filter.TwitterPreprocessingPredicate;
import org.openimaj.tools.twitter.modes.output.TwitterOutputMode;
import org.openimaj.tools.twitter.modes.output.TwitterOutputModeOption;
import org.openimaj.tools.twitter.modes.preprocessing.TwitterPreprocessingMode;
import org.openimaj.tools.twitter.modes.preprocessing.TwitterPreprocessingModeOption;
import org.openimaj.twitter.GeneralJSON;
import org.openimaj.twitter.GeneralJSONRDF;
import org.openimaj.twitter.USMFStatus;
import org.openimaj.twitter.collection.TwitterStatusListUtils;

/**
 * An abstract kind of twitter processing tool. Contains all the options generic
 * to this kind of tool, not dependent on files or hadoop or whatever.
 * <p>
 * The command line is parsed by {@link #prepare()}, which also validates the
 * configured filters and finally delegates to the subclass-provided
 * {@link #validate()}.
 *
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 *
 */
public abstract class AbstractTwitterPreprocessingToolOptions extends InOutToolOptions {

    @Option(
            name = "--mode",
            aliases = "-m",
            required = false,
            usage = "How should the tweets be processed.",
            handler = ProxyOptionHandler.class,
            multiValued = true)
    List<TwitterPreprocessingModeOption> modeOptions = new ArrayList<TwitterPreprocessingModeOption>();
    /**
     * The preprocessing to perform
     */
    public List<TwitterPreprocessingMode<?>> modeOptionsOp = new ArrayList<TwitterPreprocessingMode<?>>();

    @Option(
            name = "--pre-filter",
            aliases = "-prf",
            required = false,
            usage = "Define filters. Applied before other processing.",
            handler = ProxyOptionHandler.class,
            multiValued = true)
    List<TwitterPreprocessingFilterOption> preFilterOptions = new ArrayList<TwitterPreprocessingFilterOption>();
    /**
     * The prefiltering to perform
     */
    public List<TwitterPreprocessingPredicate> preFilterOptionsOp = new ArrayList<TwitterPreprocessingPredicate>();

    @Option(
            name = "--post-filter",
            aliases = "-pof",
            required = false,
            usage = "Define filters. Applied after other processing",
            handler = ProxyOptionHandler.class,
            multiValued = true)
    List<TwitterPreprocessingFilterOption> postFilterOptions = new ArrayList<TwitterPreprocessingFilterOption>();
    /**
     * The postfiltering to perform
     */
    public List<TwitterPreprocessingPredicate> postFilterOptionsOp = new ArrayList<TwitterPreprocessingPredicate>();

    @Option(
            name = "--encoding",
            aliases = "-e",
            required = false,
            usage = "The outputstreamwriter's text encoding",
            metaVar = "STRING")
    String encoding = "UTF-8";

    @Option(
            name = "--output-mode",
            aliases = "-om",
            required = false,
            usage = "How should the analysis be outputed.",
            handler = ProxyOptionHandler.class)
    TwitterOutputModeOption outputModeOption = TwitterOutputModeOption.APPEND;
    /** The output mode instance; starts out as the options of the default {@code APPEND} mode. */
    TwitterOutputMode outputModeOptionOp = TwitterOutputModeOption.APPEND.getOptions();

    @Option(
            name = "--n-tweets",
            aliases = "-n",
            required = false,
            usage = "How many tweets from the input should this be applied to.",
            handler = ProxyOptionHandler.class)
    int nTweets = -1;

    @Option(name = "--quiet", aliases = "-q", required = false, usage = "Control the progress messages.")
    boolean quiet = false;

    @Option(name = "--verbose", aliases = "-v", required = false, usage = "Be very loud (overrides queit)")
    boolean veryLoud = false;

    @Option(
            name = "--time-before-skip",
            aliases = "-t",
            required = false,
            usage = "Time to wait before skipping an entry")
    long timeBeforeSkip = 0;

    /**
     * the status type to take as input
     */
    @Option(
            name = "--input-type",
            aliases = "-it",
            required = false,
            usage = "The type of social media message being consumed")
    public StatusType statusType = StatusType.TWITTER;

    /**
     * the status type to output
     */
    @Option(name = "--output-type", aliases = "-ot", required = false, usage = "How to output, defaults to USMF")
    public StatusType outputStatusType = StatusType.USMF;

    /** The raw command line handed to the constructor; parsed by {@link #prepare()}. */
    private String[] args;

    /**
     * Stores the arguments and optionally parses them straight away.
     * <p>
     * Note: when {@code prepare} is true, {@link #prepare()} (and therefore the
     * abstract {@link #validate()}) runs before any subclass field
     * initialisers or constructor bodies have executed.
     *
     * @param args
     *            the arguments, prepared using the prepare method
     * @param prepare
     *            whether prepare should be called now or later
     * @throws CmdLineException
     *             if {@code prepare} is true and {@link #prepare()} fails
     */
    public AbstractTwitterPreprocessingToolOptions(String[] args, boolean prepare) throws CmdLineException {
        this.args = args;
        if (prepare) {
            this.prepare();
        }
    }

    /**
     * Stores the arguments and parses them immediately.
     *
     * @param args
     *            the arguments, prepared using the prepare method
     * @throws CmdLineException
     *             if {@link #prepare()} fails
     */
    public AbstractTwitterPreprocessingToolOptions(String[] args) throws CmdLineException {
        this(args, true);
    }

    /**
     * Prepare the tool for running: parse the stored arguments into the
     * annotated fields, resolve the quiet/verbose conflict, validate the
     * filters, register RDF analysis providers if RDF output was requested
     * and finally run {@link #validate()}.
     *
     * @throws CmdLineException
     *             if the arguments cannot be parsed or {@link #validate()}
     *             rejects them
     */
    public void prepare() throws CmdLineException {
        final CmdLineParser parser = new CmdLineParser(this);
        parser.parseArgument(args);

        // --verbose overrides --quiet. This has to be decided after parsing:
        // before parseArgument both flags still hold their default (false).
        if (veryLoud && quiet) {
            quiet = false;
        }

        InOutToolOptions.prepareMultivaluedArgument(modeOptions);
        validateFilters();
        registerRDFAnalysis();
        this.validate();
    }

    /**
     * When the output type is RDF, registers the RDF analysis provider of
     * every configured preprocessing mode under that mode's analysis key.
     */
    private void registerRDFAnalysis() {
        if (this.outputStatusType != StatusType.RDF) {
            return;
        }
        for (final TwitterPreprocessingMode<?> mode : this.modeOptionsOp) {
            GeneralJSONRDF.registerRDFAnalysisProvider(mode.getAnalysisKey(), mode.rdfAnalysisProvider());
        }
    }

    /**
     * Calls {@code validate()} on every post-filter and then on every
     * pre-filter.
     */
    private void validateFilters() {
        for (final TwitterPreprocessingPredicate filter : this.postFilterOptionsOp) {
            filter.validate();
        }
        for (final TwitterPreprocessingPredicate filter : this.preFilterOptionsOp) {
            filter.validate();
        }
    }

    /**
     * @return a one-line description of the tool (currently not referenced
     *         from within this class)
     */
    private String getExtractUsageInfo() {
        return "Preprocess tweets for bag of words analysis";
    }

    /**
     * Builds a fresh list holding the mode instance of every selected
     * {@code --mode} option.
     *
     * @return an instance of each selected preprocessing mode
     * @throws Exception
     *             declared for callers; see the individual mode options
     */
    public List<TwitterPreprocessingMode<?>> preprocessingMode() throws Exception {
        if (veryLoud) {
            System.out.println("Creating preprocessing modes");
        }
        final List<TwitterPreprocessingMode<?>> modes = new ArrayList<TwitterPreprocessingMode<?>>();
        for (final TwitterPreprocessingModeOption modeOpt : this.modeOptions) {
            modes.add(modeOpt.getOptions());
        }
        return modes;
    }

    /**
     * Validates the selected output mode against these options and returns
     * it.
     *
     * @return an instance of the selected output mode
     */
    public TwitterOutputMode ouputMode() {
        outputModeOptionOp.validate(this);
        return outputModeOptionOp;
    }

    /**
     * @return whether the options provided make sense
     * @throws CmdLineException
     */
    public abstract boolean validate() throws CmdLineException;

    /**
     * @param string
     *            print progress if we are not being quiet
     */
    public void progress(String string) {
        if (!quiet) {
            System.out.print(string);
        }
    }

    /**
     * @return print some extra information
     */
    public boolean veryLoud() {
        return this.veryLoud;
    }

    /**
     * @return the time to wait while analysing a tweet before it is skipped
     *         over
     */
    public long getTimeBeforeSkip() {
        return this.timeBeforeSkip;
    }

    /**
     * @return the encoding
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * Check the internal preprocessing filters and say whether a given status
     * should be skipped
     *
     * @param twitterStatus
     *            the status to test
     * @return whether to skip a status; false when no pre-filter is
     *         configured
     */
    public boolean preProcessesSkip(USMFStatus twitterStatus) {
        return anyFilterRejects(preFilterOptionsOp, twitterStatus);
    }

    /**
     * Check the internal postprocessing filters and say whether a given status
     * should be skipped
     *
     * @param twitterStatus
     *            the status to test
     * @return whether to skip a status; false when no post-filter is
     *         configured
     */
    public boolean postProcessesSkip(USMFStatus twitterStatus) {
        return anyFilterRejects(postFilterOptionsOp, twitterStatus);
    }

    /**
     * Tests the status against each filter in order, stopping at the first
     * filter whose {@code test} returns false.
     *
     * @param filters
     *            the filters to apply
     * @param twitterStatus
     *            the status to test
     * @return true if some filter rejected the status, false otherwise
     */
    private static boolean anyFilterRejects(List<TwitterPreprocessingPredicate> filters, USMFStatus twitterStatus) {
        for (final TwitterPreprocessingPredicate filter : filters) {
            if (!filter.test(twitterStatus)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Provides the functionality to convert to the required output format as
     * specified by -ot
     *
     * @param twitterStatus
     *            the status to convert
     * @return the converted output
     */
    public GeneralJSON convertToOutputFormat(USMFStatus twitterStatus) {
        final GeneralJSON outInstance = TwitterStatusListUtils.newInstance(this.outputStatusType.type());
        outInstance.fromUSMF(twitterStatus);
        return outInstance;
    }

    /**
     * @return the input status type
     */
    public StatusType getInputClass() {
        return this.statusType;
    }

    /**
     * @return the output status type
     */
    public StatusType getOutputClass() {
        return this.outputStatusType;
    }
}