001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.ml.dataset;
031
import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.math.matrix.VectorFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.log4j.Logger;
import org.openimaj.data.dataset.Dataset;
import org.openimaj.data.dataset.ListBackedDataset;
import org.openimaj.data.dataset.ListDataset;
import org.openimaj.data.dataset.MapBackedDataset;
import org.openimaj.experiment.annotations.DatasetDescription;
047
048/**
049 * A {@link Dataset} instance of the standard wine clustering experiment found here:
050 * 
051 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
052 *
053 */
054@DatasetDescription(
055                name = "Wine Data Set",
056                description = "" +
057                                "These data are the results of a chemical analysis of wines grown in the same region in " +
058                                "Italy but derived from three different cultivars. The analysis determined the quantities " +
059                                "of 13 constituents found in each of the three types of wines."+
060                                ""+
061                                "I think that the initial data set had around 30 variables, but for some reason I only have " +
062                                "the 13 dimensional version. I had a list of what the 30 or so variables were, but a.) I lost" +
063                                " it, and b.), I would not know which 13 variables are included in the set."+
064                                ""+
065                                "The attributes are (dontated by Riccardo Leardi, riclea '@' anchem.unige.it )"+
066                                "1) Alcohol"+
067                                "2) Malic acid"+
068                                "3) Ash"+
069                                "4) Alcalinity of ash"+
070                                "5) Magnesium"+
071                                "6) Total phenols"+
072                                "7) Flavanoids"+
073                                "8) Nonflavanoid phenols"+
074                                "9) Proanthocyanins"+
075                                "10)Color intensity"+
076                                "11)Hue"+
077                                "12)OD280/OD315 of diluted wines"+
078                                "13)Proline"+
079                                ""+
080                                "In a classification context, this is a well posed problem with \"well behaved\" class structures." +
081                                " A good data set for first testing of a new classifier, but not very challenging. ",
082                creator = "Forina, M. et al, PARVUS - ",
083                url = "http://archive.ics.uci.edu/ml/datasets/Wine",
084                downloadUrls = {
085                                "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
086                })
087public class WineDataset extends MapBackedDataset<Integer, ListDataset<double[]>, double[]>{
088        final static Logger logger = Logger.getLogger(WineDataset.class);
089        
090        /**
091         * Loads the wine dataset, mean centres the dataset
092         * @param clusters valid clusters, if empty all clusters are chosen
093         */
094        public WineDataset(Integer ... clusters) {
095                this(true,clusters);
096        }
097        
098        /**
099         * Loads the wine dataset from wine.data
100         * @param normalise whether to mean center the dataset
101         * @param clusters valid clusters, if empty all clusters are chosen
102         */
103        public WineDataset(boolean normalise, Integer... clusters) {
104                BufferedReader br = new BufferedReader(
105                                new InputStreamReader(WineDataset.class.getResourceAsStream("wine.data")));
106                String line = null;
107                Vector mean = null;
108                Set<Integer> clusterSet = null;
109                if(clusters.length!=0){
110                        clusterSet = new HashSet<Integer>();
111                        clusterSet.addAll(Arrays.asList(clusters));
112                }
113                
114                try {
115                        while((line = br.readLine())!=null){
116                                String[] parts = line.split(",");
117                                int cluster = Integer.parseInt(parts[0].trim());
118                                if(clusterSet!=null && !clusterSet.contains(cluster)) continue;
119                                double[] data = new double[parts.length-1];
120                                for (int i = 0; i < data.length; i++) {
121                                        data[i] = Double.parseDouble(parts[i+1]);
122                                }
123                                
124                                ListDataset<double[]> ds = this.get(cluster);
125                                if(ds == null) this.put(cluster, ds = new ListBackedDataset<double[]>());
126                                ds.add(data);
127                                Vector copyArray = VectorFactory.getDefault().copyArray(data);
128                                if(mean == null){
129                                        mean = copyArray.clone();
130                                }
131                                else{
132                                        mean.plusEquals(copyArray);
133                                }
134                        }
135                        mean.scaleEquals(1./this.numInstances());
136                        if(normalise) {
137                                normalise(mean);
138                        }
139                } catch (Exception e) {
140                        logger.error("Wine dataset failed to load",e);
141                }
142        }
143
144        private void normalise(Vector mean) {
145                for (double[] data : this) {
146                        for (int i = 0; i < data.length; i++) {
147                                data[i] -= mean.getElement(i);
148                        }
149                }
150        }
151}