001/**
002 * Copyright (c) 2012, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.tools.twitter.options;
031
032import java.util.ArrayList;
033import java.util.List;
034
035import org.kohsuke.args4j.CmdLineException;
036import org.kohsuke.args4j.CmdLineParser;
037import org.kohsuke.args4j.Option;
038import org.kohsuke.args4j.ProxyOptionHandler;
039import org.openimaj.tools.InOutToolOptions;
040import org.openimaj.tools.twitter.modes.filter.TwitterPreprocessingFilterOption;
041import org.openimaj.tools.twitter.modes.filter.TwitterPreprocessingPredicate;
042import org.openimaj.tools.twitter.modes.output.TwitterOutputMode;
043import org.openimaj.tools.twitter.modes.output.TwitterOutputModeOption;
044import org.openimaj.tools.twitter.modes.preprocessing.TwitterPreprocessingMode;
045import org.openimaj.tools.twitter.modes.preprocessing.TwitterPreprocessingModeOption;
046import org.openimaj.twitter.GeneralJSON;
047import org.openimaj.twitter.GeneralJSONRDF;
048import org.openimaj.twitter.USMFStatus;
049import org.openimaj.twitter.collection.TwitterStatusListUtils;
050
051/**
052 * An abstract kind of twitter processing tool. Contains all the options generic
053 * to this kind of tool, not dependant on files or hadoop or whatever.
054 * 
055 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
056 * 
057 */
058public abstract class AbstractTwitterPreprocessingToolOptions extends InOutToolOptions {
059
060        @Option(
061                        name = "--mode",
062                        aliases = "-m",
063                        required = false,
064                        usage = "How should the tweets be processed.",
065                        handler = ProxyOptionHandler.class,
066                        multiValued = true)
067        List<TwitterPreprocessingModeOption> modeOptions = new ArrayList<TwitterPreprocessingModeOption>();
068        /**
069         * The preprocessing to perform
070         */
071        public List<TwitterPreprocessingMode<?>> modeOptionsOp = new ArrayList<TwitterPreprocessingMode<?>>();
072
073        @Option(
074                        name = "--pre-filter",
075                        aliases = "-prf",
076                        required = false,
077                        usage = "Define filters. Applied before other processing.",
078                        handler = ProxyOptionHandler.class,
079                        multiValued = true)
080        List<TwitterPreprocessingFilterOption> preFilterOptions = new ArrayList<TwitterPreprocessingFilterOption>();
081        /**
082         * The prefiltering to perform
083         */
084        public List<TwitterPreprocessingPredicate> preFilterOptionsOp = new ArrayList<TwitterPreprocessingPredicate>();
085
086        @Option(
087                        name = "--post-filter",
088                        aliases = "-pof",
089                        required = false,
090                        usage = "Define filters. Applied after other processing",
091                        handler = ProxyOptionHandler.class,
092                        multiValued = true)
093        List<TwitterPreprocessingFilterOption> postFilterOptions = new ArrayList<TwitterPreprocessingFilterOption>();
094        /**
095         * the postfiltering to perform
096         */
097        public List<TwitterPreprocessingPredicate> postFilterOptionsOp = new ArrayList<TwitterPreprocessingPredicate>();
098        //
099        @Option(
100                        name = "--encoding",
101                        aliases = "-e",
102                        required = false,
103                        usage = "The outputstreamwriter's text encoding",
104                        metaVar = "STRING")
105        String encoding = "UTF-8";
106
107        @Option(
108                        name = "--output-mode",
109                        aliases = "-om",
110                        required = false,
111                        usage = "How should the analysis be outputed.",
112                        handler = ProxyOptionHandler.class)
113        TwitterOutputModeOption outputModeOption = TwitterOutputModeOption.APPEND;
114        TwitterOutputMode outputModeOptionOp = TwitterOutputModeOption.APPEND.getOptions();
115
116        @Option(
117                        name = "--n-tweets",
118                        aliases = "-n",
119                        required = false,
120                        usage = "How many tweets from the input should this be applied to.",
121                        handler = ProxyOptionHandler.class)
122        int nTweets = -1;
123
124        @Option(name = "--quiet", aliases = "-q", required = false, usage = "Control the progress messages.")
125        boolean quiet = false;
126
127        @Option(name = "--verbose", aliases = "-v", required = false, usage = "Be very loud (overrides queit)")
128        boolean veryLoud = false;
129
130        @Option(
131                        name = "--time-before-skip",
132                        aliases = "-t",
133                        required = false,
134                        usage = "Time to wait before skipping an entry")
135        long timeBeforeSkip = 0;
136
137        /**
138         * the status type to take as input
139         */
140        @Option(
141                        name = "--input-type",
142                        aliases = "-it",
143                        required = false,
144                        usage = "The type of social media message being consumed")
145        public StatusType statusType = StatusType.TWITTER;
146
147        /**
148         * the status type to output
149         */
150        @Option(name = "--output-type", aliases = "-ot", required = false, usage = "How to output, defaults to USMF")
151        public StatusType outputStatusType = StatusType.USMF;
152
153        private String[] args;
154
155        /**
156         * @param args
157         *            the arguments, prepared using the prepare method
158         * @param prepare
159         *            whether prepare should be called now or later
160         */
161        public AbstractTwitterPreprocessingToolOptions(String[] args, boolean prepare) throws CmdLineException{
162                this.args = args;
163                if (prepare)
164                        this.prepare();
165        }
166
167        /**
168         * @param args
169         *            the arguments, prepared using the prepare method
170         */
171        public AbstractTwitterPreprocessingToolOptions(String[] args) throws CmdLineException{
172                this(args, true);
173        }
174
175        /**
176         * prepare the tool for running
177         */
178        public void prepare() throws CmdLineException{
179                final CmdLineParser parser = new CmdLineParser(this);
180                try {
181                        if (veryLoud && quiet) {
182                                quiet = false;
183                                veryLoud = true;
184                        }
185                        parser.parseArgument(args);
186                        InOutToolOptions.prepareMultivaluedArgument(modeOptions);
187                        validateFilters();
188                        registerRDFAnalysis();
189                        this.validate();
190                } catch (final CmdLineException e) {
191                        throw e;
192                }
193
194        }
195
196        private void registerRDFAnalysis() {
197                if (this.outputStatusType == StatusType.RDF) {
198                        for (final TwitterPreprocessingMode<?> modes : this.modeOptionsOp) {
199                                GeneralJSONRDF.registerRDFAnalysisProvider(modes.getAnalysisKey(), modes.rdfAnalysisProvider());
200                        }
201                }
202        }
203
204        private void validateFilters() {
205                for (final TwitterPreprocessingPredicate filter : this.postFilterOptionsOp) {
206                        filter.validate();
207                }
208                for (final TwitterPreprocessingPredicate filter : this.preFilterOptionsOp) {
209                        filter.validate();
210                }
211        }
212
213        private String getExtractUsageInfo() {
214                return "Preprocess tweets for bag of words analysis";
215        }
216
217        /**
218         * @return an instance of the selected preprocessing mode
219         * @throws Exception
220         */
221        public List<TwitterPreprocessingMode<?>> preprocessingMode() throws Exception {
222                if (veryLoud) {
223                        System.out.println("Creating preprocessing modes");
224                }
225                final ArrayList<TwitterPreprocessingMode<?>> modes = new ArrayList<TwitterPreprocessingMode<?>>();
226                for (final TwitterPreprocessingModeOption modeOpt : this.modeOptions) {
227                        modes.add(modeOpt.getOptions());
228                }
229                return modes;
230        }
231
232        /**
233         * @return an instance of the selected output mode
234         */
235        public TwitterOutputMode ouputMode() {
236                outputModeOptionOp.validate(this);
237                return outputModeOptionOp;
238        }
239
240        /**
241         * @return whether the options provided make sense
242         * @throws CmdLineException
243         */
244        public abstract boolean validate() throws CmdLineException;
245
246        /**
247         * @param string
248         *            print progress if we are not being quiet
249         */
250        public void progress(String string) {
251                if (!quiet) {
252                        System.out.print(string);
253                }
254        }
255
256        /**
257         * @return print some extra information
258         */
259        public boolean veryLoud() {
260                return this.veryLoud;
261        }
262
263        /**
264         * @return the time to wait while analysing a tweet before it is skipped
265         *         over
266         */
267        public long getTimeBeforeSkip() {
268                return this.timeBeforeSkip;
269        }
270
271        /**
272         * @return the encoding
273         */
274        public String getEncoding() {
275                return encoding;
276        }
277
278        /**
279         * Check the internal preprocessing filters and say whether a given status
280         * should be skipped
281         * 
282         * @param twitterStatus
283         * @return whether to skip a status
284         */
285        public boolean preProcessesSkip(USMFStatus twitterStatus) {
286                boolean skip = false;
287                for (final TwitterPreprocessingPredicate f : preFilterOptionsOp) {
288                        skip = !f.test(twitterStatus);
289                        if (skip)
290                                break;
291                }
292                return skip;
293        }
294
295        /**
296         * Check the internal postprocessing filters and say whether a given status
297         * should be skipped
298         * 
299         * @param twitterStatus
300         * @return whether to skip a status
301         */
302        public boolean postProcessesSkip(USMFStatus twitterStatus) {
303                boolean skip = false;
304                for (final TwitterPreprocessingPredicate f : postFilterOptionsOp) {
305                        skip = !f.test(twitterStatus);
306                        if (skip)
307                                break;
308                }
309                return skip;
310        }
311
312        /**
313         * Provides the functionality to convert to the required output format as
314         * specified by -ot
315         * 
316         * @param twitterStatus
317         * @return the converted output
318         */
319        public GeneralJSON convertToOutputFormat(USMFStatus twitterStatus) {
320                final GeneralJSON outInstance = TwitterStatusListUtils.newInstance(this.outputStatusType.type());
321                outInstance.fromUSMF(twitterStatus);
322                return outInstance;
323        }
324        
325        /**
326         * @return the input status type
327         */
328        public StatusType  getInputClass() {
329                return this.statusType;
330        }
331        
332        /**
333         * @return the input status type
334         */
335        public StatusType getOutputClass() {
336                return this.outputStatusType;
337        }
338}