ML programs

1. Implement and demonstrate the FIND-S algorithm for finding the most specific hypothesis based on a given set of training data samples. Read the training data from a .CSV file.

import csv

# FIND-S: start from the most specific hypothesis and generalise it only as
# far as needed to cover every positive training example.
# Both spellings of the positive class label are accepted, so data sets
# labelled "True" and data sets labelled "Yes" are handled alike.
POSITIVE_LABELS = ("True", "Yes")

with open('C:/xampp/htdocs/girish/PlayTennis.csv', 'r') as f:
    reader = csv.reader(f)
    your_list = list(reader)

    # One slot per attribute, i.e. every column except the trailing class
    # label. '0' means "no value accepted yet", '?' means "any value".
    h = [['0'] * (len(your_list[0]) - 1 if your_list else 0)]

    for i in your_list:
        print(i)
        # Negative examples (and a header row, whose last cell is not a
        # positive label) never change the hypothesis.
        if i and i[-1] in POSITIVE_LABELS:
            # Walk the attribute columns by position so that the hypothesis
            # slot always lines up with the column being examined.
            for j, x in enumerate(i[:-1]):
                if h[0][j] == '0':
                    # First positive example: adopt its value.
                    h[0][j] = x
                elif x != h[0][j]:
                    # Conflicting value: generalise this attribute.
                    h[0][j] = '?'

print("Most specific hypothesis is")
print(h)

output:

['Outlook', 'Temperature', 'Humidity', 'Wind', 'Play Tennis']
['Sunny', 'Hot', 'High', 'Weak', 'No']
['Sunny', 'Hot', 'High', 'Strong', 'No']
['Overcast', 'Hot', 'High', 'Weak', 'Yes']
['Rain', 'Mild', 'High', 'Weak', 'Yes']
['Rain', 'Cool', 'Normal', 'Weak', 'Yes']
['Rain', 'Cool', 'Normal', 'Strong', 'No']
['Overcast', 'Cool', 'Normal', 'Strong', 'Yes']
['Sunny', 'Mild', 'High', 'Weak', 'No']
['Sunny', 'Cool', 'Normal', 'Weak', 'Yes']
['Rain', 'Mild', 'Normal', 'Weak', 'Yes']
['Sunny', 'Mild', 'Normal', 'Strong', 'Yes']
['Overcast', 'Mild', 'High', 'Strong', 'Yes']
['Overcast', 'Hot', 'Normal', 'Weak', 'Yes']
['Rain', 'Mild', 'High', 'Strong', 'No']
Most specific hypothesis is
[['0', '0', '0', '0', '0', '0']]

2. For a given set of training data examples stored in a .CSV file, implement and demonstrate the Candidate-Elimination algorithm to output a description of the set of all hypotheses consistent with the training examples.

import numpy as np
import pandas as pd
import csv
# Simplified Candidate-Elimination: maintains one specific hypothesis and one
# general hypothesis row per attribute, then drops the rows that stayed fully
# unconstrained.
# The context manager closes the file as soon as the rows are read.
with open('ai2.csv') as file:
    data = list(csv.reader(file))

# Split every row into its attribute values and its class label (last column).
concepts = [row[:-1] for row in data]
target = [row[-1] for row in data]

# The hypothesis width follows the data instead of being fixed at six columns.
num_attributes = len(concepts[0])
specific_h = ['O'] * num_attributes  # 'O' marks "no value seen yet"
general_h = [['?'] * num_attributes for _ in range(num_attributes)]

for i, instance in enumerate(concepts):
    if target[i] == "Yes":
        # Positive example: generalise the specific hypothesis where it
        # disagrees, and relax the matching general constraint.
        for x in range(num_attributes):
            if specific_h[x] == 'O':
                specific_h[x] = instance[x]
            elif instance[x] != specific_h[x]:
                specific_h[x] = '?'
                general_h[x][x] = '?'
    if target[i] == "No":
        # Negative example: constrain the general hypothesis on every
        # attribute where the example differs from the specific hypothesis.
        for x in range(num_attributes):
            general_h[x][x] = specific_h[x] if instance[x] != specific_h[x] else '?'

# Rows that constrain nothing carry no information, so they are removed.
unconstrained = ['?'] * num_attributes
general_h = [row for row in general_h if row != unconstrained]

print("Final Specific : ", specific_h, sep = '\n')
print("Final General : ", general_h, sep = '\n')


Output
Final Specific :
['Sunny', 'Warm', '?', 'Strong', '?', '?']
Final General :
[['Sunny', '?', '?', '?', '?', '?'], ['?', 'Warm', '?', '?', '?', '?']]

3. Demonstrate the working of the decision tree based ID3 algorithm. Use an appropriate data set for building the decision tree and apply this knowledge to classify a new sample.

import pandas as pd
from pandas import DataFrame

# read_csv already returns a DataFrame, so no extra DataFrame(...) wrapper is needed.
df_tennis = pd.read_csv('C:/Users/Akshatha/PycharmProjects/pythonPrograms/playtennis.csv')


def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    probs: iterable of probabilities.

    A probability of exactly zero contributes nothing (the limit of
    -p * log2(p) as p approaches 0 is 0) instead of raising ValueError.
    Negative values still raise ValueError from the logarithm.
    """
    import math
    return sum(-prob * math.log2(prob) for prob in probs if prob != 0)


def entropy_of_list(a_list):
    """Return the entropy of the values in a_list and print their counts.

    a_list must expose a ``name`` attribute (it is printed alongside the
    per-value counts); the counts are turned into relative frequencies and
    handed to entropy().
    """
    from collections import Counter
    class_counts = Counter(iter(a_list))
    print("No and Yes Classes:", a_list.name, class_counts)
    total = float(len(a_list))
    frequencies = [count / total for count in class_counts.values()]
    return entropy(frequencies)


# Entropy of the 'playtennis' column over the whole data set.
# entropy_of_list also prints the per-class counts as a side effect.
total_entropy = entropy_of_list(df_tennis['playtennis'])
print("Entropy of given PlayTennis Data Set:", total_entropy)


def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    """Return the information gain of splitting df on split_attribute_name.

    The gain is the entropy of the target column before the split minus the
    size-weighted entropy of the target column inside each group. Every
    group, the per-group summary table and the class counts are printed
    along the way. ``trace`` is accepted but not used.
    """
    print("Information Gain Calculation of ", split_attribute_name)
    grouped = df.groupby(split_attribute_name)
    for value, rows in grouped:
        print(value)
        print(rows)
    total_rows = float(len(df.index))
    # Per group: entropy of the target and the share of rows in that group.
    per_value = grouped.agg(
        {target_attribute_name: [entropy_of_list, lambda x: len(x) / total_rows]}
    )[target_attribute_name]
    per_value.columns = ['Entropy', 'PropObservations']
    print(per_value)
    weighted = per_value['Entropy'] * per_value['PropObservations']
    new_entropy = sum(weighted)
    old_entropy = entropy_of_list(df[target_attribute_name])
    return old_entropy - new_entropy


# Report the information gain of each attribute on the full data set.
# The call is concatenated onto the label so that it is actually evaluated
# rather than being printed as literal text.
print("\n Info-gain for Outlook is :" + str(information_gain(df_tennis, 'outlook', 'playtennis')), "\n")
print("\n Info-gain for Humidity is: " + str(information_gain(df_tennis, 'humidity', 'playtennis')), "\n")
print("\n Info-gain for Wind is:" + str(information_gain(df_tennis, 'wind', 'playtennis')), "\n")
print("\n Info-gain for Temperature is:" + str(information_gain(df_tennis, 'temperature', 'playtennis')), "\n")


def id3(df, target_attribute_name, attribute_names, default_class=None):
    """Build an ID3 decision tree as nested dicts.

    df: data frame holding the attribute columns and the target column.
    target_attribute_name: name of the class-label column.
    attribute_names: attributes still available for splitting.
    default_class: label returned when df is empty or no attributes remain.

    Returns a class label for a leaf, otherwise a dict of the form
    {attribute: {attribute_value: subtree_or_label}}.
    """
    from collections import Counter
    cnt = Counter(x for x in df[target_attribute_name])
    if len(cnt) == 1:
        # Pure node: every row carries the same label.
        return next(iter(cnt))
    elif df.empty or (not attribute_names):
        # Nothing left to split on: fall back to the caller's default.
        return default_class
    else:
        # The fallback for the children is the majority class of this node
        # (most frequent label), not the alphabetically largest label.
        default_class = cnt.most_common(1)[0][0]
        gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
        index_of_max = gainz.index(max(gainz))
        best_attr = attribute_names[index_of_max]
        tree = {best_attr: {}}
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        # Grow one subtree per observed value of the chosen attribute.
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset, target_attribute_name, remaining_attribute_names, default_class)
            tree[best_attr][attr_val] = subtree
        return tree


from pprint import pprint

# All column names are listed first; the class label is then removed so that
# only the predicting attributes are offered to id3.
attribute_names = df_tennis.columns.tolist()
print("List of Attributes:", attribute_names)
attribute_names.remove('playtennis')
print("Predicting Attributes:", attribute_names)

tree = id3(df_tennis, 'playtennis', attribute_names)
print("\n\nThe Resultant Decision Tree is :\n")
pprint(tree)


def classify(instance, tree, default=None):
    """Classify instance by walking a nested-dict decision tree.

    instance: mapping (or row) indexable by attribute name.
    tree: {attribute: {value: subtree_or_label}} as produced by id3.
    default: label returned when a value has no branch at any depth.
    """
    attribute = next(iter(tree))
    if instance[attribute] in tree[attribute].keys():
        result = tree[attribute][instance[attribute]]
        if isinstance(result, dict):
            # Pass the default down so an unseen value deeper in the tree
            # also falls back to it instead of returning None.
            return classify(instance, result, default)
        else:
            return result  # this is a label
    else:
        return default


# Accuracy of the full tree on the data it was built from.
df_tennis['predicted'] = df_tennis.apply(classify, axis=1, args=(tree, 'no'))
print('Accuracy is:' + str(sum(df_tennis['playtennis'] == df_tennis['predicted']) / (1.0 * len(df_tennis.index))
                           ))
df_tennis[['playtennis', 'predicted']]  # bare expression: has no effect when run as a script

# Hold out the last four rows for testing. Positional slicing (iloc) is
# required here: label slicing with loc[1:-4] selects nothing and loc[-4:]
# selects every row on a default integer index.
# NOTE(review): the training slice starts at position 1, so the first row is
# in neither set - confirm whether that is intended.
training_data = df_tennis.iloc[1:-4]
# Copy the slice so that adding a column does not write into a view of df_tennis.
test_data = df_tennis.iloc[-4:].copy()
train_tree = id3(training_data, 'playtennis', attribute_names)
test_data['predicted2'] = test_data.apply(
    # <----test_data source
    classify,
    axis=1,
    args=(train_tree, 'yes'))  # <---- train_data tree
print('\n\n Accuracy is: ' + str(
    sum(test_data['playtennis'] == test_data['predicted2']) / (1.0 * len(test_data.index))))

OUTPUT:
The output is very large; run the program to see it 😂

4. Build an Artificial Neural Network by implementing the Backpropagation algorithm and test the same using appropriate data sets.

import numpy as np
# Three training samples with two input features each, and one target each.
X = np.array([[2, 9], [1, 5], [3, 6]], dtype=float)
y = np.array([[92], [86], [89]], dtype=float)
# Scale each input column by its own maximum and the targets by 100.
X = X / X.max(axis=0)
y = y / 100
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x), element-wise for arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def derivatives_sigmoid(x):
    """Return x * (1 - x): the sigmoid's slope expressed via its output x."""
    return (1 - x) * x
# Hyper-parameters and layer sizes.
epoch = 7000
lr = 0.1
inputlayer_neurons = 2
hiddenlayer_neurons = 3
output_neurons = 1

# Random initial weights and biases for the hidden and output layers.
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

for i in range(epoch):
    # Forward pass.
    hinp1 = np.dot(X, wh)
    hinp = hinp1 + bh
    hlayer_act = sigmoid(hinp)
    outinp1 = np.dot(hlayer_act, wout)
    outinp = outinp1 + bout
    output = sigmoid(outinp)

    # Backward pass: this must run on every epoch, otherwise the network is
    # never trained. The hidden-layer error is computed from wout before wout
    # is changed.
    EO = y - output
    outgrad = derivatives_sigmoid(output)
    d_output = EO * outgrad
    EH = d_output.dot(wout.T)
    hiddengrad = derivatives_sigmoid(hlayer_act)
    d_hiddenlayer = EH * hiddengrad

    # Update every trainable parameter, not only the output weights.
    wout += hlayer_act.T.dot(d_output) * lr
    bout += np.sum(d_output, axis=0, keepdims=True) * lr
    wh += X.T.dot(d_hiddenlayer) * lr
    bh += np.sum(d_hiddenlayer, axis=0, keepdims=True) * lr

print("Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n", output)

OUTPUT:
Input: 
[[0.66666667 1.        ]
 [0.33333333 0.55555556]
 [1.         0.66666667]]
Actual Output: 
[[0.92]
 [0.86]
 [0.89]]
Predicted Output: 
 [[0.88071043]
 [0.86746983]
 [0.88574158]]

5. Write a program to implement the naive Bayesian classifier for a sample training data set stored as a .CSV file. Compute the accuracy of the classifier, considering a few test data sets.

import csv
import random
import math


def loadcsv(filename):
    """Read a numeric CSV file and return its rows as lists of floats.

    filename: path of the CSV file; every cell must parse as a float,
    otherwise float() raises ValueError.

    The file is opened in a context manager so the handle is closed as soon
    as the rows have been read.
    """
    with open(filename, "r", newline="") as handle:
        # converting strings into numbers for processing
        return [[float(x) for x in row] for row in csv.reader(handle)]


def splitdataset(dataset, splitratio):
    """Randomly split dataset into [training rows, remaining rows].

    int(len(dataset) * splitratio) rows are drawn at random, without
    replacement, into the training list; whatever is left forms the second
    list. The input list itself is not modified.
    """
    trainsize = int(len(dataset) * splitratio)
    remaining = list(dataset)
    trainset = []
    for _ in range(trainsize):
        # Pick a random position among the rows not chosen yet.
        picked = random.randrange(len(remaining))
        trainset.append(remaining.pop(picked))
    return [trainset, remaining]


def separatebyclass(dataset):
    """Group the rows of dataset by their last element (the class value).

    Returns a dict mapping each class value to the list of rows carrying it,
    in the order the rows appear.
    """
    separated = {}
    for vector in dataset:
        separated.setdefault(vector[-1], []).append(vector)
    return separated


def mean(numbers):
    """Return the arithmetic mean of numbers as a float."""
    total = sum(numbers)
    count = float(len(numbers))
    return total / count


def stdev(numbers):
    """Return the sample standard deviation of numbers (divisor n - 1)."""
    avg = mean(numbers)
    squared_deviations = [pow(x - avg, 2) for x in numbers]
    return math.sqrt(sum(squared_deviations) / float(len(numbers) - 1))


def summarize(dataset):
    """Return a (mean, stdev) pair for every column except the last.

    The statistics are computed column-wise over all rows; the entry for
    the final column (the class label) is discarded.
    """
    columns = zip(*dataset)
    summaries = [(mean(column), stdev(column)) for column in columns]
    summaries.pop()  # excluding labels +ve or -ve
    return summaries


def summarizebyclass(dataset):
    """Return {class value: per-attribute (mean, stdev) list} for dataset.

    The rows are first grouped by class value, then each group is
    summarised independently.
    """
    grouped = separatebyclass(dataset)
    return {classvalue: summarize(instances) for classvalue, instances in grouped.items()}


def calculateprobability(x, mean, stdev):
    """Return the Gaussian density at x for the given mean and stdev."""
    two_variance = 2 * math.pow(stdev, 2)
    exponent = math.exp(-(math.pow(x - mean, 2) / two_variance))
    normaliser = math.sqrt(2 * math.pi) * stdev
    return (1 / normaliser) * exponent


def calculateclassprobabilities(summaries, inputvector):
    """Return {class value: product of per-attribute Gaussian densities}.

    summaries maps each class value to its list of (mean, stdev) pairs;
    the i-th pair is evaluated at inputvector[i].
    """
    probabilities = {}
    for classvalue, classsummaries in summaries.items():
        likelihood = 1
        for i, (mean, stdev) in enumerate(classsummaries):
            likelihood *= calculateprobability(inputvector[i], mean, stdev)
        probabilities[classvalue] = likelihood
    return probabilities


def predict(summaries, inputvector):
    """Return the class value whose computed probability is highest.

    Ties keep the class encountered first; None is returned when
    summaries holds no classes.
    """
    probabilities = calculateclassprobabilities(summaries, inputvector)
    bestLabel, bestProb = None, -1
    for classvalue, probability in probabilities.items():
        # Skip any class that does not beat the current best.
        if bestLabel is not None and not probability > bestProb:
            continue
        bestLabel, bestProb = classvalue, probability
    return bestLabel


def getpredictions(summaries, testset):
    """Return the predicted class for every row of testset, in order."""
    return [predict(summaries, testset[i]) for i in range(len(testset))]


def getaccuracy(testset, predictions):
    """Return the percentage of rows whose last element equals its prediction."""
    total = len(testset)
    correct = sum(1 for i in range(total) if testset[i][-1] == predictions[i])
    return (correct / float(total)) * 100.0


def main():
    """Load naivedata.csv, train on a random 67% split and print the accuracy."""
    dataset = loadcsv('naivedata.csv')
    trainingset, testset = splitdataset(dataset, 0.67)
    print('Split {0} rows into train={1} and test={2} rows'.format(len(dataset), len(trainingset), len(testset)))

    # Fit the per-class attribute statistics on the training rows only.
    summaries = summarizebyclass(trainingset)

    # Score the held-out rows against their true labels.
    predictions = getpredictions(summaries, testset)
    accuracy = getaccuracy(testset, predictions)
    print('Accuracy of the classifier is : {0}%'.format(accuracy))


main()

OUTPUT:

Split 768 rows into train=514 and test=254 rows
Accuracy of the classifier is : 77.16535433070865%

8. Apply the EM algorithm (Gaussian mixture model) to cluster a data set. Use the same data set for clustering with the k-Means algorithm. Compare the results of these two algorithms and comment on the quality of clustering.

import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture

# Load iris into labelled frames: four feature columns and the true targets.
iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
X.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']
y = pd.DataFrame(iris.target)
y.columns = ['Targets']

# k-Means with three clusters on the raw features.
model = KMeans(n_clusters=3)
model.fit(X)

plt.figure(figsize=(14, 14))
colormap = np.array(['red', 'lime', 'black'])


def _scatter_panel(position, labels, title):
    # Draw one petal-length / petal-width scatter coloured by the given labels.
    plt.subplot(2, 2, position)
    plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[labels], s=40)
    plt.title(title)
    plt.xlabel('Petal Length')
    plt.ylabel('Petal Width')


_scatter_panel(1, y.Targets, 'Real Clusters')
_scatter_panel(2, model.labels_, 'K-Means Clustering')

# Standardise the features, then fit a three-component Gaussian mixture.
scaler = preprocessing.StandardScaler()
scaler.fit(X)
xsa = scaler.transform(X)
xs = pd.DataFrame(xsa, columns=X.columns)
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)
gmm_y = gmm.predict(xs)

_scatter_panel(3, gmm_y, 'GMM Clustering')
print('Observation: The GMM using EM algorithm based clustering matched the true labels more closely than the Kmeans.')
plt.show()

output:

Observation: The GMM using EM algorithm based clustering matched the true labels more closely than the Kmeans.

9. Write a program to implement the k-Nearest Neighbour algorithm to classify the iris data set. Print both correct and wrong predictions. Java/Python ML library classes can be used for this problem.

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets

iris = datasets.load_iris()
print("iris datasets loaded")
# Hold out 10% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.1)
print("dataset is training and testing ....")
print("size of training data and its label", x_train.shape, y_train.shape)
print("size of testing data and its label", x_test.shape, y_test.shape)

# List the numeric label of every class name.
for i in range(len(iris.target_names)):
    print("label", i, "-", str(iris.target_names[i]))

# Train and evaluate once, outside the label loop, so the model is not
# refitted and the results are not repeated for every class name.
classifier = KNeighborsClassifier(n_neighbors=1)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print("Results of classification using K-NN K=1")
for r in range(0, len(x_test)):
    print("Sample: ", str(x_test[r]), "Actual-Label: ", str(y_test[r]), "Predicted-Label: ", str(y_pred[r]))
# The accuracy is a single figure for the whole test set, so print it once.
print("Classification Accuracy: ", classifier.score(x_test, y_test))

OUTPUT:

iris datasets loaded
dataset is training and testing ....
size of training data and its label (135, 4) (135,)
size of training data and its label (15, 4) (15,)
label 0 - setosa
Results of classification using K-NN K=1
Sample:  [5.  2.3 3.3 1. ] Actual-Label:  1 Predicted-Label:  1
Classification Accuracy:  0.9333333333333333
Sample:  [6.1 3.  4.6 1.4] Actual-Label:  1 Predicted-Label:  1
Classification Accuracy:  0.9333333333333333
Sample:  [5.8 2.7 5.1 1.9] Actual-Label:  2 Predicted-Label:  2
Classification Accuracy:  0.9333333333333333
Sample:  [6.3 2.5 4.9 1.5] Actual-Label:  1 Predicted-Label:  2
Classification Accuracy:  0.9333333333333333
Sample:  [6.7 2.5 5.8 1.8] Actual-Label:  2 Predicted-Label:  2
Classification Accuracy:  0.9333333333333333
Sample:  [6.7 3.1 4.4 1.4] Actual-Label:  1 Predicted-Label:  1
Classification Accuracy:  0.9333333333333333
Sample:  [5.5 2.6 4.4 1.2] Actual-Label:  1 Predicted-Label:  1
Classification Accuracy:  0.9333333333333333
Sample:  [6.  2.2 4.  1. ] Actual-Label:  1 Predicted-Label:  1
Classification Accuracy:  0.9333333333333333
Sample:  [6.4 2.8 5.6 2.1] Actual-Label:  2 Predicted-Label:  2
Classification Accuracy:  0.9333333333333333
Sample:  [5.6 2.9 3.6 1.3] Actual-Label:  1 Predicted-Label:  1
Classification Accuracy:  0.9333333333333333
Sample:  [6.9 3.1 5.1 2.3] Actual-Label:  2 Predicted-Label:  2
Classification Accuracy:  0.9333333333333333
Sample:  [5.2 2.7 3.9 1.4] Actual-Label:  1 Predicted-Label:  1
Classification Accuracy:  0.9333333333333333
Sample:  [7.1 3.  5.9 2.1] Actual-Label:  2 Predicted-Label:  2
Classification Accuracy:  0.9333333333333333
Sample:  [6.3 3.4 5.6 2.4] Actual-Label:  2 Predicted-Label:  2
Classification Accuracy:  0.9333333333333333
Sample:  [6.1 2.8 4.  1.3] Actual-Label:  1 Predicted-Label:  1
Classification Accuracy:  0.9333333333333333

10. Implement the non-parametric Locally Weighted Regression algorithm in order to fit data points. Select an appropriate data set for your experiment and draw graphs.

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


def kernel(point, xmat, k):
    """Return the diagonal Gaussian weight matrix for one query point.

    point: 1 x n row matrix, the query point.
    xmat: m x n matrix of training points (np.matrix, so * is a matrix product).
    k: bandwidth; smaller values weight nearby points more heavily.

    Entry (j, j) is exp(-||point - xmat[j]||^2 / (2 * k^2)); all
    off-diagonal entries are zero.
    """
    m, n = np.shape(xmat)
    # np.asmatrix replaces np.mat, which no longer exists in NumPy 2.x.
    weights = np.asmatrix(np.eye(m))
    for j in range(m):
        # Use the xmat argument, not a module-level X.
        diff = point - xmat[j]
        # diff * diff.T is a 1 x 1 matrix; take the scalar out explicitly.
        squared_distance = (diff * diff.T)[0, 0]
        weights[j, j] = np.exp(squared_distance / (-2.0 * k ** 2))
    return weights


def localWeight(point, xmat, ymat, k):
    """Return the locally weighted least-squares coefficients for point.

    point: 1 x n query row; xmat: m x n design matrix; ymat: 1 x m targets;
    k: kernel bandwidth. All matrix arguments are np.matrix objects.

    Solves (X^T W X)^-1 X^T W y with W from kernel(point, xmat, k), using
    the xmat argument rather than a module-level X.
    """
    wei = kernel(point, xmat, k)
    W = (xmat.T * (wei * xmat)).I * (xmat.T * (wei * ymat.T))
    return W


def localWeightRegression(xmat, ymat, k):
    """Return the locally weighted prediction for every row of xmat.

    xmat: m x n design matrix (np.matrix); ymat: 1 x m targets; k: kernel
    bandwidth. A separate weighted fit is solved for each row.
    """
    m, n = np.shape(xmat)
    ypred = np.zeros(m)
    for i in range(m):
        # The product is a 1 x 1 matrix; index the scalar out instead of
        # relying on the deprecated implicit array-to-scalar conversion.
        ypred[i] = (xmat[i] * localWeight(xmat[i], xmat, ymat, k))[0, 0]
    return ypred


def graphPlot(X, ypred):
    """Scatter the raw data and draw the fitted curve over it.

    X: 2-D design matrix whose column 1 is used for ordering the points.
    ypred: predictions, indexed in the same order as the rows of X.

    The scatter uses the module-level names bill and tip, not the arguments.
    """
    # Row order that sorts the points by column 1 of X.
    sortindex = X[:, 1].argsort(0)
    # NOTE(review): this indexing appears to rely on np.matrix fancy-indexing
    # behaviour (X is presumably an np.matrix) - confirm before changing it.
    xsort = X[sortindex][:, 0]
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(bill, tip, color='green')
    # Plot in sorted order so the curve is drawn left to right.
    ax.plot(xsort[:, 1], ypred[sortindex], color='red', linewidth=5)
    plt.xlabel('Total bill')
    plt.ylabel('Tip')
    plt.show();


# Load the data set and pull out the two columns used by the regression.
data = pd.read_csv('data10_tips.csv')
bill = np.array(data.total_bill)
tip = np.array(data.tip)

# np.asmatrix replaces np.mat, which no longer exists in NumPy 2.x.
mbill = np.asmatrix(bill)
mtip = np.asmatrix(tip)

# Design matrix: a column of ones (intercept) next to the bill column.
m = np.shape(mbill)[1]
one = np.asmatrix(np.ones(m))
X = np.hstack((one.T, mbill.T))

ypred = localWeightRegression(X, mtip, 0.5)
graphPlot(X, ypred)

OUTPUT:
Regression with parameter k = 3 

Regression with parameter k = 9




Comments

Popular posts from this blog

big data 8

big data 5