001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.ml.dataset; 031 032import gov.sandia.cognition.math.matrix.Vector; 033import gov.sandia.cognition.math.matrix.VectorFactory; 034 035import java.io.BufferedReader; 036import java.io.InputStreamReader; 037import java.util.Arrays; 038import java.util.HashSet; 039import java.util.Set; 040 041import org.apache.log4j.Logger; 042import org.openimaj.data.dataset.Dataset; 043import org.openimaj.data.dataset.ListBackedDataset; 044import org.openimaj.data.dataset.ListDataset; 045import org.openimaj.data.dataset.MapBackedDataset; 046import org.openimaj.experiment.annotations.DatasetDescription; 047 048/** 049 * A {@link Dataset} instance of the standard wine clustering experiment found here: 050 * 051 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 052 * 053 */ 054@DatasetDescription( 055 name = "Wine Data Set", 056 description = "" + 057 "These data are the results of a chemical analysis of wines grown in the same region in " + 058 "Italy but derived from three different cultivars. The analysis determined the quantities " + 059 "of 13 constituents found in each of the three types of wines."+ 060 ""+ 061 "I think that the initial data set had around 30 variables, but for some reason I only have " + 062 "the 13 dimensional version. I had a list of what the 30 or so variables were, but a.) I lost" + 063 " it, and b.), I would not know which 13 variables are included in the set."+ 064 ""+ 065 "The attributes are (dontated by Riccardo Leardi, riclea '@' anchem.unige.it )"+ 066 "1) Alcohol"+ 067 "2) Malic acid"+ 068 "3) Ash"+ 069 "4) Alcalinity of ash"+ 070 "5) Magnesium"+ 071 "6) Total phenols"+ 072 "7) Flavanoids"+ 073 "8) Nonflavanoid phenols"+ 074 "9) Proanthocyanins"+ 075 "10)Color intensity"+ 076 "11)Hue"+ 077 "12)OD280/OD315 of diluted wines"+ 078 "13)Proline"+ 079 ""+ 080 "In a classification context, this is a well posed problem with \"well behaved\" class structures." + 081 " A good data set for first testing of a new classifier, but not very challenging. ", 082 creator = "Forina, M. et al, PARVUS - ", 083 url = "http://archive.ics.uci.edu/ml/datasets/Wine", 084 downloadUrls = { 085 "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data" 086 }) 087public class WineDataset extends MapBackedDataset<Integer, ListDataset<double[]>, double[]>{ 088 final static Logger logger = Logger.getLogger(WineDataset.class); 089 090 /** 091 * Loads the wine dataset, mean centres the dataset 092 * @param clusters valid clusters, if empty all clusters are chosen 093 */ 094 public WineDataset(Integer ... clusters) { 095 this(true,clusters); 096 } 097 098 /** 099 * Loads the wine dataset from wine.data 100 * @param normalise whether to mean center the dataset 101 * @param clusters valid clusters, if empty all clusters are chosen 102 */ 103 public WineDataset(boolean normalise, Integer... clusters) { 104 BufferedReader br = new BufferedReader( 105 new InputStreamReader(WineDataset.class.getResourceAsStream("wine.data"))); 106 String line = null; 107 Vector mean = null; 108 Set<Integer> clusterSet = null; 109 if(clusters.length!=0){ 110 clusterSet = new HashSet<Integer>(); 111 clusterSet.addAll(Arrays.asList(clusters)); 112 } 113 114 try { 115 while((line = br.readLine())!=null){ 116 String[] parts = line.split(","); 117 int cluster = Integer.parseInt(parts[0].trim()); 118 if(clusterSet!=null && !clusterSet.contains(cluster)) continue; 119 double[] data = new double[parts.length-1]; 120 for (int i = 0; i < data.length; i++) { 121 data[i] = Double.parseDouble(parts[i+1]); 122 } 123 124 ListDataset<double[]> ds = this.get(cluster); 125 if(ds == null) this.put(cluster, ds = new ListBackedDataset<double[]>()); 126 ds.add(data); 127 Vector copyArray = VectorFactory.getDefault().copyArray(data); 128 if(mean == null){ 129 mean = copyArray.clone(); 130 } 131 else{ 132 mean.plusEquals(copyArray); 133 } 134 } 135 mean.scaleEquals(1./this.numInstances()); 136 if(normalise) { 137 normalise(mean); 138 } 139 } catch (Exception e) { 140 logger.error("Wine dataset failed to load",e); 141 } 142 } 143 144 private void normalise(Vector mean) { 145 for (double[] data : this) { 146 for (int i = 0; i < data.length; i++) { 147 data[i] -= mean.getElement(i); 148 } 149 } 150 } 151}