diff --git a/learning.py b/learning.py index afc0caceb..e3d3ea05e 100644 --- a/learning.py +++ b/learning.py @@ -293,7 +293,7 @@ def sample(self): # ______________________________________________________________________________ -def PluralityLearner(dataset): +def PluralityLearner(dataset, size=None): """A very dumb algorithm: always pick the result that was most popular in the training data. Makes a baseline for comparison.""" most_popular = mode([e[dataset.target] for e in dataset.examples]) @@ -306,14 +306,14 @@ def predict(example): # ______________________________________________________________________________ -def NaiveBayesLearner(dataset, continuous=True): +def NaiveBayesLearner(dataset, size=None, continuous=True): if(continuous): - return NaiveBayesContinuous(dataset) + return NaiveBayesContinuous(dataset, size) else: - return NaiveBayesDiscrete(dataset) + return NaiveBayesDiscrete(dataset, size) -def NaiveBayesDiscrete(dataset): +def NaiveBayesDiscrete(dataset, size): """Just count how many times each value of each input attribute occurs, conditional on the target value. Count the different target values too.""" @@ -341,7 +341,7 @@ def class_probability(targetval): return predict -def NaiveBayesContinuous(dataset): +def NaiveBayesContinuous(dataset, size): """Count how many times each target value occurs. 
Also, find the means and deviations of input attribute values for each target value.""" means, deviations = dataset.find_means_and_deviations() @@ -426,7 +426,7 @@ def __repr__(self): # ______________________________________________________________________________ -def DecisionTreeLearner(dataset): +def DecisionTreeLearner(dataset, size=None): """[Figure 18.5]""" target, values = dataset.target, dataset.values @@ -905,49 +905,66 @@ def train_and_test(dataset, start, end): return train, val +def partition(dataset, fold, k): + num_examples = len(dataset.examples) + return train_and_test(dataset, fold * (num_examples / k), (fold + 1) * (num_examples / k)) + + def cross_validation(learner, size, dataset, k=10, trials=1): """Do k-fold cross_validate and return their mean. That is, keep out 1/k of the examples for testing on each of k runs. Shuffle the examples first; if trials>1, average over several shuffles. Returns Training error, Validataion error""" - if k is None: - k = len(dataset.examples) if trials > 1: trial_errT = 0 trial_errV = 0 + for t in range(trials): - errT, errV = cross_validation(learner, size, dataset, - k=10, trials=1) + errT, errV = cross_validation(learner, size, dataset, k) trial_errT += errT trial_errV += errV + return trial_errT / trials, trial_errV / trials else: fold_errT = 0 fold_errV = 0 - n = len(dataset.examples) + examples = dataset.examples for fold in range(k): random.shuffle(dataset.examples) - train_data, val_data = train_and_test(dataset, fold * (n / k), - (fold + 1) * (n / k)) - dataset.examples = train_data + train_data, val_data = partition(dataset, fold, k) + dataset.examples = train_data h = learner(dataset, size) + fold_errT += err_ratio(h, dataset, train_data) fold_errV += err_ratio(h, dataset, val_data) + # Reverting back to original once test is completed dataset.examples = examples + return fold_errT / k, fold_errV / k +def leave_one_out(learner, dataset, size=None): + """Leave one out cross-validation over the dataset.""" + return 
cross_validation(learner, size, dataset, k=len(dataset.examples)) + + +def converges(err_val): + """Check for convergence provided err_val has at least two values""" + return len(err_val) >= 2 and isclose(err_val[-2], err_val[-1], rel_tol=1e-6) + + def cross_validation_wrapper(learner, dataset, k=10, trials=1): """[Fig 18.8] Return the optimal value of size having minimum error - on validataion set. + on validation set. err_train: A training error array, indexed by size err_val: A validataion error array, indexed by size """ - err_val = [] err_train = [] + err_val = [] + size = 1 while True: @@ -963,15 +979,15 @@ def cross_validation_wrapper(learner, dataset, k=10, trials=1): min_val = err_val[i] best_size = i i += 1 + err_val.append(errV) err_train.append(errT) - print(err_val) - size += 1 + if converges(err_val): + best_size = size + return learner(dataset, best_size) -def leave_one_out(learner, dataset, size=None): - """Leave one out cross-validation over the dataset.""" - return cross_validation(learner, size, dataset, k=len(dataset.examples)) + size += 1 def learningcurve(learner, dataset, trials=10, sizes=None): @@ -1096,14 +1112,14 @@ def ContinuousXor(n): # ______________________________________________________________________________ -def compare(algorithms=[PluralityLearner, NaiveBayesLearner, - NearestNeighborLearner, DecisionTreeLearner], - datasets=[iris, orings, zoo, restaurant, SyntheticRestaurant(20), - Majority(7, 100), Parity(7, 100), Xor(100)], - k=10, trials=1): +def compare(algorithms=[PluralityLearner, NaiveBayesLearner, NearestNeighborLearner, + DecisionTreeLearner], + datasets=[iris, orings, zoo, restaurant, SyntheticRestaurant(20), Majority(7, 100), + Parity(7, 100), Xor(100)], + k=10, size=3, trials=1): """Compare various learners on various datasets using cross-validation. 
Print results as a table.""" print_table([[a.__name__.replace('Learner', '')] + - [cross_validation(a, d, k, trials) for d in datasets] + [cross_validation(a, size, d, k, trials) for d in datasets] for a in algorithms], - header=[''] + [d.name[0:7] for d in datasets], numfmt='%.2f') + header=[''] + [d.name[0:7] for d in datasets], numfmt='{:.2f}') diff --git a/tests/test_learning.py b/tests/test_learning.py index 34346b7ec..89babe9cf 100644 --- a/tests/test_learning.py +++ b/tests/test_learning.py @@ -1,4 +1,3 @@ - import pytest import math from utils import DataFile @@ -7,7 +6,6 @@ rms_error, manhattan_distance, mean_boolean_error, mean_error) - def test_euclidean(): distance = euclidean_distance([1, 2], [3, 4]) assert round(distance, 2) == 2.83 @@ -149,6 +147,16 @@ def test_perceptron(): assert err_ratio(perceptron, iris) < 0.4 +def test_train_and_test(): + dataset = DataSet(name="iris") + start = 50 + end = 100 + + train_set, validation_set = train_and_test(dataset, start, end) + + assert len(train_set) == 100 + assert len(validation_set) == 50 + def test_random_weights(): min_value = -0.5 max_value = 0.5 @@ -157,4 +165,3 @@ def test_random_weights(): assert len(test_weights) == num_weights for weight in test_weights: assert weight >= min_value and weight <= max_value - \ No newline at end of file diff --git a/utils.py b/utils.py index 1757526ff..951eafece 100644 --- a/utils.py +++ b/utils.py @@ -317,6 +317,18 @@ def issequence(x): return isinstance(x, collections.abc.Sequence) +def format_table_value(value, numfmt): + if isnumber(value): + value = numfmt.format(value) + elif type(value) is tuple: + tmp = [] + for v in value: + tmp.append(format_table_value(v, numfmt)) + value = tuple(tmp) + + return value + + def print_table(table, header=None, sep=' ', numfmt='{}'): """Print a list of lists as a table, so that columns line up nicely. header, if specified, will be printed as the first row. 
@@ -328,7 +340,7 @@ def print_table(table, header=None, sep=' ', numfmt='{}'): if header: table.insert(0, header) - table = [[numfmt.format(x) if isnumber(x) else x for x in row] + table = [[format_table_value(x, numfmt) for x in row] for row in table] sizes = list(