ML programs

1. Implement and demonstrate the FIND-S algorithm for finding the most specific hypothesis based on a given set of training data samples. Read the training data from a .CSV file.

import csv

def find_s(rows, positive_labels=("True", "Yes")):
    """Return the most specific hypothesis covering every positive example.

    Args:
        rows: parsed CSV rows (lists of strings). The last item of each row is
            the class label; the items before it are attribute values.
        positive_labels: label values that mark a positive example. A header
            row is ignored as long as its last cell is not one of these.

    Returns:
        A list with one entry per attribute: '0' when no positive example has
        been seen, the attribute value when all positives agree, '?' otherwise.
        An empty list is returned when there are no rows at all.
    """
    rows = [row for row in rows if row]  # blank CSV lines parse to []
    if not rows:
        return []

    # Size the hypothesis from the data (everything except the label column)
    # instead of assuming a fixed number of attributes.
    hypothesis = ['0'] * (len(rows[0]) - 1)
    for row in rows:
        if row[-1] not in positive_labels:
            continue  # FIND-S ignores negative examples
        for j, value in zip(range(len(hypothesis)), row[:-1]):
            if hypothesis[j] == '0':
                hypothesis[j] = value      # first positive: copy the value
            elif hypothesis[j] != value:
                hypothesis[j] = '?'        # disagreement: generalise
    return hypothesis


if __name__ == "__main__":
    with open('C:/xampp/htdocs/girish/PlayTennis.csv', 'r') as f:
        your_list = list(csv.reader(f))

    for i in your_list:
        print(i)

    # Kept as a one-element list so the printed shape matches earlier runs.
    h = [find_s(your_list)]
    print("Most specific hypothesis is")
    print(h)

output:

['Outlook', 'Temperature', 'Humidity', 'Wind', 'Play Tennis']

['Sunny', 'Hot', 'High', 'Weak', 'No']

['Sunny', 'Hot', 'High', 'Strong', 'No']

['Overcast', 'Hot', 'High', 'Weak', 'Yes']

['Rain', 'Mild', 'High', 'Weak', 'Yes']

['Rain', 'Cool', 'Normal', 'Weak', 'Yes']

['Rain', 'Cool', 'Normal', 'Strong', 'No']

['Overcast', 'Cool', 'Normal', 'Strong', 'Yes']

['Sunny', 'Mild', 'High', 'Weak', 'No']

['Sunny', 'Cool', 'Normal', 'Weak', 'Yes']

['Rain', 'Mild', 'Normal', 'Weak', 'Yes']

['Sunny', 'Mild', 'Normal', 'Strong', 'Yes']

['Overcast', 'Mild', 'High', 'Strong', 'Yes']

['Overcast', 'Hot', 'Normal', 'Weak', 'Yes']

['Rain', 'Mild', 'High', 'Strong', 'No']

Most specific hypothesis is

[['0', '0', '0', '0', '0', '0']]

2. For a given set of training data examples stored in a .CSV file, implement and demonstrate the Candidate-Elimination algorithm to output a description of the set of all hypotheses consistent with the training examples.

import numpy as np

import pandas as pd

import csv

def candidate_elimination(concepts, target):
    """Compute the specific and general boundaries for labelled examples.

    Args:
        concepts: list of attribute-value lists, one per example.
        target: list of labels aligned with ``concepts``; only "Yes" and "No"
            are acted on, any other label (e.g. a header cell) is ignored.

    Returns:
        ``(specific_h, general_h)``. ``specific_h`` has one entry per attribute
        ('O' = nothing seen yet, '?' = any value). ``general_h`` is a list of
        hypotheses with the fully unconstrained (all-'?') rows removed.
    """
    if not concepts:
        return [], []

    width = len(concepts[0])
    specific_h = ['O'] * width
    general_h = [['?'] * width for _ in range(width)]

    for instance, label in zip(concepts, target):
        if label == "Yes":
            for x in range(width):
                if specific_h[x] == 'O':
                    specific_h[x] = instance[x]
                elif instance[x] != specific_h[x]:
                    # Positive example disagrees: generalise both boundaries.
                    specific_h[x] = '?'
                    general_h[x][x] = '?'
        elif label == "No":
            for x in range(width):
                # Constrain attribute x only where it separates this negative
                # example from the current specific hypothesis.
                general_h[x][x] = specific_h[x] if instance[x] != specific_h[x] else '?'

    # Drop unconstrained rows; the row is built from the data width rather
    # than a fixed six-column literal.
    unconstrained = ['?'] * width
    general_h = [hypothesis for hypothesis in general_h if hypothesis != unconstrained]
    return specific_h, general_h


if __name__ == "__main__":
    with open('ai2.csv') as file:
        # Blank lines parse to [] and carry no label, so skip them.
        data = [row for row in csv.reader(file) if row]

    concepts = [row[:-1] for row in data]
    target = [row[-1] for row in data]

    specific_h, general_h = candidate_elimination(concepts, target)

    print("Final Specific : ", specific_h, sep='\n')
    print("Final General : ", general_h, sep='\n')

Output

Final Specific :

['Sunny', 'Warm', '?', 'Strong', '?', '?']

Final General :

[['Sunny', '?', '?', '?', '?', '?'], ['?', 'Warm', '?', '?', '?', '?']]

3. Demonstrate the working of the decision tree based ID3 algorithm. Use an appropriate data set for building the decision tree and apply this knowledge to classify a new sample.

import pandas as pd

from pandas import DataFrame

# Load the PlayTennis table; read_csv already returns a DataFrame.
df_tennis = pd.read_csv('C:/Users/Akshatha/PycharmProjects/pythonPrograms/playtennis.csv')

def entropy(probs):
    """Return the Shannon entropy, in bits, of a probability distribution.

    Args:
        probs: iterable of probabilities.

    Returns:
        ``sum(-p * log2(p))`` over the non-zero probabilities. Zero
        probabilities are skipped (their limit contribution is 0) instead of
        raising ``ValueError`` from the logarithm.
    """
    import math

    return sum(-prob * math.log2(prob) for prob in probs if prob > 0)

def entropy_of_list(a_list):
    """Print the class counts of ``a_list`` and return its entropy.

    ``a_list`` must be iterable, support ``len()`` and expose a ``.name``
    attribute (it is printed in the trace line), e.g. a pandas Series.
    """
    from collections import Counter

    class_counts = Counter(iter(a_list))
    print("No and Yes Classes:", a_list.name, class_counts)

    # Turn raw counts into relative frequencies before measuring entropy.
    total = len(a_list) * 1.0
    return entropy([count / total for count in class_counts.values()])

# Entropy of the class column over the whole data set.
class_column = df_tennis['playtennis']
total_entropy = entropy_of_list(class_column)
print("Entropy of given PlayTennis Data Set:", total_entropy)

def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    """Return the entropy reduction obtained by splitting ``df`` on an attribute.

    Args:
        df: DataFrame holding both the split and the target column.
        split_attribute_name: column to group by.
        target_attribute_name: column whose entropy is measured.
        trace: accepted but not used by this function.

    Returns:
        Entropy of the target column minus the size-weighted entropy of the
        target within each group. Groups and intermediate tables are printed.
    """
    print("Information Gain Calculation of ", split_attribute_name)

    groups = df.groupby(split_attribute_name)
    for name, group in groups:
        print(name)
        print(group)

    nobs = len(df.index) * 1.0

    # Per group: entropy of the target and the share of rows in that group.
    aggregations = {target_attribute_name: [entropy_of_list, lambda x: len(x) / nobs]}
    per_group = groups.agg(aggregations)[target_attribute_name]
    per_group.columns = ['Entropy', 'PropObservations']
    print(per_group)

    weighted = per_group['Entropy'] * per_group['PropObservations']
    new_entropy = sum(weighted)
    old_entropy = entropy_of_list(df[target_attribute_name])
    return old_entropy - new_entropy

# Report the information gain of each attribute. Each label string is closed
# before the concatenation so information_gain() is actually called rather
# than being printed as literal text.
print('\n Info-gain for Outlook is :' + str(information_gain(df_tennis, 'outlook', 'playtennis')), '\n')
print('\n Info-gain for Humidity is: ' + str(information_gain(df_tennis, 'humidity', 'playtennis')), '\n')
print('\n Info-gain for Wind is:' + str(information_gain(df_tennis, 'wind', 'playtennis')), '\n')
print('\n Info-gain for Temperature is:' + str(information_gain(df_tennis, 'temperature', 'playtennis')), '\n')

def id3(df, target_attribute_name, attribute_names, default_class=None):
    """Build an ID3 decision tree as nested dictionaries.

    Args:
        df: DataFrame with the attribute columns and the target column.
        target_attribute_name: name of the class column.
        attribute_names: list of columns still available for splitting.
        default_class: label returned when ``df`` has no rows.

    Returns:
        A class label for a leaf, or ``{attribute: {value: subtree, ...}}``
        for an internal node.
    """
    from collections import Counter

    cnt = Counter(x for x in df[target_attribute_name])

    if df.empty:
        return default_class
    if len(cnt) == 1:
        # Pure node: every remaining example has the same class.
        return next(iter(cnt))

    # Most frequent class at this node (not the alphabetically largest key).
    majority_class = cnt.most_common(1)[0][0]
    if not attribute_names:
        # Nothing left to split on: label the leaf with this node's majority.
        return majority_class

    gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
    best_attr = attribute_names[gainz.index(max(gainz))]

    tree = {best_attr: {}}
    remaining_attribute_names = [name for name in attribute_names if name != best_attr]

    for attr_val, data_subset in df.groupby(best_attr):
        tree[best_attr][attr_val] = id3(
            data_subset, target_attribute_name, remaining_attribute_names, majority_class
        )
    return tree

from pprint import pprint

# Every column except the class column is a candidate split attribute.
attribute_names = df_tennis.columns.tolist()
print("List of Attributes:", attribute_names)
attribute_names.remove('playtennis')
print("Predicting Attributes:", attribute_names)

# Grow the tree on the full data set and show it.
tree = id3(df_tennis, 'playtennis', attribute_names)
print("\n\nThe Resultant Decision Tree is :\n")
pprint(tree)

def classify(instance, tree, default=None):
    """Classify ``instance`` by walking a nested-dict decision tree.

    Args:
        instance: mapping (dict or DataFrame row) from attribute name to value.
        tree: ``{attribute: {value: subtree_or_label}}``, or a bare label.
        default: label returned when the instance's value for the tested
            attribute has no branch, at any depth of the tree.

    Returns:
        The label at the leaf reached, or ``default``.
    """
    if not isinstance(tree, dict):
        return tree  # the "tree" is already a leaf label

    attribute = next(iter(tree))
    branches = tree[attribute]
    if instance[attribute] not in branches:
        return default

    result = branches[instance[attribute]]
    if isinstance(result, dict):
        # Carry the fallback down so unseen values deeper in the tree also
        # resolve to ``default`` rather than None.
        return classify(instance, result, default)
    return result  # this is a label

# Classify every row with the tree grown on the full data set; 'no' is the
# fallback label for attribute values the tree has no branch for.
df_tennis['predicted'] = df_tennis.apply(classify, axis=1, args=(tree, 'no'))
print('Accuracy is:' + str(sum(df_tennis['playtennis'] == df_tennis['predicted']) / (1.0 * len(df_tennis.index))))

# Shown explicitly: a bare expression displays nothing outside a notebook.
print(df_tennis[['playtennis', 'predicted']])

# Hold-out evaluation. The split must be positional (iloc): with label-based
# .loc on the default index, 1:-4 selects no rows and -4: selects all of them.
# NOTE(review): the 1:-4 bounds also leave row 0 out of the training set.
training_data = df_tennis.iloc[1:-4]
# Copy so adding a column does not write into a slice of df_tennis.
test_data = df_tennis.iloc[-4:].copy()

train_tree = id3(training_data, 'playtennis', attribute_names)

test_data['predicted2'] = test_data.apply(
    classify,                   # applied to each test_data row
    axis=1,
    args=(train_tree, 'yes'))   # tree grown on training_data only

print('\n\n Accuracy is: ' + str(
    sum(test_data['playtennis'] == test_data['predicted2']) / (1.0 * len(test_data.index))))

OUTPUT:

The output is very long; run the program to see it.

4. Build an Artificial Neural Network by implementing the Backpropagation algorithm and test the same using appropriate data sets.

import numpy as np

# Three training samples with two input features each, and one target each.
X = np.array([[2, 9], [1, 5], [3, 6]], dtype=float)
y = np.array([[92], [86], [89]], dtype=float)

# Scale each input column by its maximum and the targets by 100.
X = X / X.max(axis=0)
y = y / 100

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x), element-wise for arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def derivatives_sigmoid(x):
    """Return x * (1 - x).

    This is the sigmoid's slope when ``x`` is already a sigmoid output, which
    is how the training loop calls it.
    """
    complement = 1 - x
    return x * complement

# Training schedule.
epoch = 7000   # number of passes over the training set
lr = 0.1       # learning rate

# Layer sizes.
inputlayer_neurons = 2
hiddenlayer_neurons = 3
output_neurons = 1

# Weights and biases drawn from np.random.uniform with its default range.
# Draw order is kept (wh, bh, wout, bout) so seeded runs reproduce.
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

for i in range(epoch):
    # Forward pass: input -> hidden -> output.
    hinp1 = np.dot(X, wh)
    hinp = hinp1 + bh
    hlayer_act = sigmoid(hinp)
    outinp1 = np.dot(hlayer_act, wout)
    outinp = outinp1 + bout
    output = sigmoid(outinp)

    # Backward pass: output error, then the error pushed back to the hidden
    # layer using the output weights from before this step's update.
    EO = y - output
    outgrad = derivatives_sigmoid(output)
    d_output = EO * outgrad
    EH = d_output.dot(wout.T)
    hiddengrad = derivatives_sigmoid(hlayer_act)
    d_hiddenlayer = EH * hiddengrad

    # Update every trainable parameter. Previously only wout was updated, so
    # the hidden layer and both biases kept their random initial values.
    wout += hlayer_act.T.dot(d_output) * lr
    bout += np.sum(d_output, axis=0, keepdims=True) * lr
    wh += X.T.dot(d_hiddenlayer) * lr
    bh += np.sum(d_hiddenlayer, axis=0, keepdims=True) * lr

print("Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n", output)

OUTPUT:

Input:

[[0.66666667 1. ]

[0.33333333 0.55555556]

[1. 0.66666667]]

Actual Output:

[[0.92]

[0.86]

[0.89]]

Predicted Output:

[[0.88071043]

[0.86746983]

[0.88574158]]

5. Write a program to implement the naive Bayesian classifier for a sample training data set stored as a .CSV file. Compute the accuracy of the classifier, considering few test data sets.

import csv

import random

import math

def loadcsv(filename):
    """Read a numeric CSV file into a list of rows of floats.

    Args:
        filename: path of the CSV file; every cell must parse as a float.

    Returns:
        One list of floats per non-empty line of the file.

    Raises:
        ValueError: if a cell cannot be converted to float.
    """
    # The context manager closes the file; newline="" is what the csv module
    # expects for files it reads.
    with open(filename, "r", newline="") as handle:
        # Blank lines parse to [] and would break code that reads row[-1].
        return [[float(x) for x in row] for row in csv.reader(handle) if row]

def splitdataset(dataset, splitratio):
    """Randomly split ``dataset`` into ``[trainset, rest]``.

    ``int(len(dataset) * splitratio)`` rows are drawn without replacement for
    the training set; the remaining rows are returned in their original order.
    ``dataset`` itself is not modified.
    """
    trainsize = int(len(dataset) * splitratio)
    remaining = list(dataset)
    trainset = []
    for _ in range(trainsize):
        # Pick a random position among the rows not yet chosen.
        picked = random.randrange(len(remaining))
        trainset.append(remaining.pop(picked))
    return [trainset, remaining]

def separatebyclass(dataset):
    """Group rows by their last element (the class value).

    Returns a dict mapping each class value to the list of rows carrying it,
    with rows kept in their original order.
    """
    separated = {}
    for vector in dataset:
        separated.setdefault(vector[-1], []).append(vector)
    return separated

def mean(numbers):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    total = sum(numbers)
    return total / float(len(numbers))

def stdev(numbers):
    """Return the sample standard deviation (n - 1 denominator).

    Needs at least two values; a single value divides by zero.
    """
    avg = mean(numbers)
    squared_deviations = [(x - avg) ** 2 for x in numbers]
    variance = sum(squared_deviations) / float(len(numbers) - 1)
    return math.sqrt(variance)

def summarize(dataset):
    """Return a ``(mean, stdev)`` pair for every column except the last.

    The last column holds the class label, so its summary is dropped.
    """
    summaries = []
    for attribute in zip(*dataset):  # one tuple per column
        summaries.append((mean(attribute), stdev(attribute)))
    summaries.pop()  # discard the label column's summary
    return summaries

def summarizebyclass(dataset):
    """Return ``{class value: [(mean, stdev), ...]}`` for ``dataset``.

    Rows are first grouped by class, then each group is summarised per
    attribute column.
    """
    return {
        classvalue: summarize(instances)
        for classvalue, instances in separatebyclass(dataset).items()
    }

def calculateprobability(x, mean, stdev):
    """Return the normal density N(mean, stdev^2) evaluated at ``x``."""
    squared_distance = math.pow(x - mean, 2)
    exponent = math.exp(-(squared_distance / (2 * math.pow(stdev, 2))))
    normaliser = 1 / (math.sqrt(2 * math.pi) * stdev)
    return normaliser * exponent

def calculateclassprobabilities(summaries, inputvector):
    """Return ``{class value: product of per-attribute normal densities}``.

    ``summaries`` maps each class to its list of ``(mean, stdev)`` pairs;
    ``inputvector`` is read position by position against those pairs.
    """
    probabilities = {}
    for classvalue, classsummaries in summaries.items():
        likelihood = 1
        for idx, (mu, sigma) in enumerate(classsummaries):
            # Attributes are treated as independent, so densities multiply.
            likelihood *= calculateprobability(inputvector[idx], mu, sigma)
        probabilities[classvalue] = likelihood
    return probabilities

def predict(summaries, inputvector):
    """Return the class with the highest probability for ``inputvector``.

    On ties the class encountered first wins; ``None`` is returned when
    ``summaries`` holds no classes.
    """
    probabilities = calculateclassprobabilities(summaries, inputvector)
    # max() keeps the first maximal key, matching a strict ">" scan.
    return max(probabilities, key=probabilities.get, default=None)

def getpredictions(summaries, testset):
    """Return the predicted class for every row of ``testset``, in order."""
    return [predict(summaries, row) for row in testset]

def getaccuracy(testset, predictions):
    """Return the percentage of rows whose last element equals its prediction.

    ``predictions[i]`` is compared with ``testset[i][-1]``; an empty
    ``testset`` divides by zero.
    """
    correct = sum(
        1 for idx, row in enumerate(testset) if row[-1] == predictions[idx]
    )
    return (correct / float(len(testset))) * 100.0

def main():
    """Train and evaluate the naive Bayes classifier on 'naivedata.csv'.

    Loads the data, splits it 67/33 into training and test rows, summarises
    the training rows per class, predicts the test rows and prints the split
    sizes and the accuracy.
    """
    filename = 'naivedata.csv'
    splitratio = 0.67

    dataset = loadcsv(filename)
    trainingset, testset = splitdataset(dataset, splitratio)
    print('Split {0} rows into train={1} and test={2} rows'.format(len(dataset), len(trainingset), len(testset)))

    # Fit: per-class mean and standard deviation of every attribute.
    summaries = summarizebyclass(trainingset)

    # Evaluate on the held-out rows.
    predictions = getpredictions(summaries, testset)
    accuracy = getaccuracy(testset, predictions)
    print('Accuracy of the classifier is : {0}%'.format(accuracy))


main()

OUTPUT:

Split 768 rows into train=514 and test=254 rows

Accuracy of the classifier is : 77.16535433070865%

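8. Apply the EM algorithm (Gaussian mixture model) to cluster a data set. Use the same data set for clustering with the k-Means algorithm. Compare the results of the two algorithms and comment on the quality of clustering. Java/Python ML library classes can be used for this problem.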

import matplotlib.pyplot as plt

from sklearn import datasets

from sklearn.cluster import KMeans

import pandas as pd

import numpy as np

from sklearn import preprocessing
from sklearn.mixture import GaussianMixture


def _scatter_panel(position, labels, title):
    """Draw petal length vs. petal width in subplot ``position`` of a 2x2 grid,
    colouring each point by ``labels``."""
    plt.subplot(2, 2, position)
    plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[labels], s=40)
    plt.title(title)
    plt.xlabel('Petal Length')
    plt.ylabel('Petal Width')


# Iris measurements and their true class labels as DataFrames.
iris = datasets.load_iris()
X = pd.DataFrame(
    iris.data,
    columns=['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'],
)
y = pd.DataFrame(iris.target, columns=['Targets'])

# k-Means with three clusters on the raw features.
model = KMeans(n_clusters=3)
model.fit(X)

plt.figure(figsize=(14, 14))
colormap = np.array(['red', 'lime', 'black'])

_scatter_panel(1, y.Targets, 'Real Clusters')
_scatter_panel(2, model.labels_, 'K-Means Clustering')

# Standardise the features before fitting the Gaussian mixture.
scaler = preprocessing.StandardScaler()
scaler.fit(X)
xsa = scaler.transform(X)
xs = pd.DataFrame(xsa, columns=X.columns)

# Three-component Gaussian mixture.
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)
gmm_y = gmm.predict(xs)

_scatter_panel(3, gmm_y, 'GMM Clustering')

print('Observation: The GMM using EM algorithm based clustering matched the true labels more closely than the Kmeans.')
plt.show()

output:

Observation: The GMM using EM algorithm based clustering matched the true labels more closely than the Kmeans.

9. Write a program to implement the k-Nearest Neighbour algorithm to classify the iris data set. Print both correct and wrong predictions. Java/Python ML library classes can be used for this problem.

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

from sklearn import datasets

iris = datasets.load_iris()
print("iris datasets loaded")

# Hold out 10% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.1)
print("dataset is training and testing ....")
print("size of training data and its label", x_train.shape, y_train.shape)
# This line reports the test split; it was previously labelled "training".
print("size of testing data and its label", x_test.shape, y_test.shape)

# Show which species name each numeric label stands for.
for i, label_name in enumerate(iris.target_names):
    print("label", i, "-", str(label_name))

# 1-nearest-neighbour classifier fitted on the training split.
classifier = KNeighborsClassifier(n_neighbors=1)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print("Results of classification using K-NN K=1")
for sample, actual, predicted in zip(x_test, y_test, y_pred):
    print("Sample: ", str(sample), "Actual-Label: ", str(actual), "Predicted-Label: ", str(predicted))

print("Classification Accuracy: ", classifier.score(x_test, y_test))

OUTPUT:

iris datasets loaded

dataset is training and testing ....

size of training data and its label (135, 4) (135,)

size of training data and its label (15, 4) (15,)

label 0 - setosa

Results of classification using K-NN K=1

Sample: [5. 2.3 3.3 1. ] Actual-Label: 1 Predicted-Label: 1