python(BI)

PRACTICAL NO.1

AIM: Perform the data classification using classification algorithm.

1A. Perform the data classification using the Naïve Bayes Algorithm.

# Gaussian Naive Bayes on the Iris dataset: hold out 40% of the samples,
# fit on the remainder and report the accuracy on the held-out part.
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

iris = load_iris()
X, y = iris.data, iris.target

# random_state is fixed so the split (and the printed accuracy) is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1
)

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# accuracy_score returns a fraction; scale it to a percentage for display.
accuracy_percent = metrics.accuracy_score(y_test, y_pred) * 100
print("Gaussian Naive Bayes model accuracy(in%):", accuracy_percent)

EXAMPLE-spam.csv

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Spam/ham SMS classifier: bag-of-words n-gram counts fed to a
# Multinomial Naive Bayes model.
df = pd.read_csv('/content/spam1.csv', encoding='latin-1')

# Keep only the text and label columns and give them short names.
df = df[['Message', 'Category']]
df.columns = ['SMS', 'Type']

# The call must stay a single expression: a line break in the middle of the
# `strip_accents` keyword is a syntax error.
countvec = CountVectorizer(
    ngram_range=(1, 4),
    stop_words='english',
    strip_accents='unicode',
    max_features=1000,
)
bow = countvec.fit_transform(df.SMS)

X_train = bow.toarray()
Y_train = df.Type.values

mnb = MultinomialNB()
mnb.fit(X_train, Y_train)

# Each message is held in one variable so the text that is printed is
# exactly the text that is classified.
message1 = 'Free gifts for all'
text1 = countvec.transform([message1])
print(message1)
print(mnb.predict(text1))

message2 = 'We will go for a lunch'
text2 = countvec.transform([message2])
print(message2)
print(mnb.predict(text2))

EXAMPLE-CORRECTED

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import os

# Spam/ham SMS classifier for a CSV whose label column is `v1` and whose
# text column is `v2`.
file_path = 'spam.csv'
df = pd.read_csv(file_path, encoding='latin-1')

df = df.rename(columns={'v1': 'Type', 'v2': 'SMS'})
df = df[['SMS', 'Type']]

# Encode the labels numerically; any label other than 'ham'/'spam' maps to
# NaN and is removed by the dropna() that follows.
df['Type'] = df['Type'].map({'ham': 0, 'spam': 1})
df = df.dropna()

countvec = CountVectorizer(
    ngram_range=(1, 4),
    stop_words='english',
    strip_accents='unicode',
    max_features=1000,
)
bow = countvec.fit_transform(df['SMS'])

X_train = bow.toarray()
Y_train = df['Type'].values

mnb = MultinomialNB()
mnb.fit(X_train, Y_train)

test_messages = ["Free gifts for all", "We will go for a lunch"]
test_bow = countvec.transform(test_messages)

# The print call is the loop body and therefore has to be indented.
for msg, pred in zip(test_messages, mnb.predict(test_bow)):
    print(f"Message: '{msg}' => Prediction: {'Spam' if pred == 1 else 'Ham'}")

1B. Perform the data classification using SVM classifier.

# Linear-kernel SVM on the first two Iris features, followed by a plot of the
# decision regions over a dense grid.
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm, datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
X = iris.data[:, :2]  # only two features so the regions can be drawn in 2-D
Y = iris.target

x_train, x_test, y_train, y_test = train_test_split(
    X, Y, random_state=0, test_size=0.3
)

clf = svm.SVC(kernel="linear", C=1)
clf = clf.fit(x_train, y_train)

classifier_predictions = clf.predict(x_test)
print(accuracy_score(y_test, classifier_predictions) * 100)

# Build a grid with step h that extends one unit beyond the data on each side.
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)

# Classify every grid point, then fold the labels back into the grid shape.
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.coolwarm)

plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Linear")
plt.show()

PRACTICAL NO.2

AIM: Perform the data clustering using clustering algorithm.

2A.To demonstrate unsupervised Machine Learning Algorithm, Partitioned Clustering technique(KMeans Clustering Algorithm).

import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
from sklearn.cluster import KMeans

# K-Means on two columns of Mall_Customers.csv: first an elbow plot to pick
# k, then the final 5-cluster model and a scatter plot of the clusters.
dataset = pd.read_csv('Mall_Customers.csv')

# Columns 3 and 4 are the two features; the axis labels below call them
# annual income and spending score.
x = dataset.iloc[:, [3, 4]].values

# Elbow method: record the within-cluster sum of squares for k = 1..10.
# The three statements below form the loop body and must be indented.
wcss_list = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(x)
    wcss_list.append(kmeans.inertia_)

mtp.plot(range(1, 11), wcss_list)
mtp.title('The Elbow Method Graph')
mtp.xlabel('Number of clusters(k)')
mtp.ylabel('wcss_list')
mtp.show()

# Final model with the chosen number of clusters.
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
y_predict = kmeans.fit_predict(x)

# One scatter call per cluster, each with its own colour and legend entry.
cluster_colours = ['blue', 'green', 'red', 'cyan', 'magenta']
for cluster_id, colour in enumerate(cluster_colours):
    members = x[y_predict == cluster_id]
    mtp.scatter(members[:, 0], members[:, 1], s=100, c=colour,
                label=f'Cluster {cluster_id + 1}')

mtp.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=300, c='yellow', label='Centroid')

mtp.title('Clusters of customers')
mtp.xlabel('Annual income(k$)')
mtp.ylabel('Spending Score(1-100)')
mtp.legend()
mtp.show()

2B. Solving the Wholesale Customer Segmentation Problem using Hierarchical Clustering (Agglomerative Clustering algorithm)

import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering

# Hierarchical (agglomerative, Ward linkage) clustering on two columns of
# Mall_Customers.csv: a dendrogram first, then the 5-cluster assignment.
dataset = pd.read_csv('Mall_Customers.csv')

# Columns 3 and 4 are the two features; the axis labels below call them
# annual income and spending score.
x = dataset.iloc[:, [3, 4]].values

dendro = shc.dendrogram(shc.linkage(x, method="ward"))
mtp.title("Dendrogram Plot")  # spelling of the plot title corrected
mtp.ylabel("Euclidean Distances")
mtp.xlabel("Customers")
mtp.show()

hc = AgglomerativeClustering(n_clusters=5, linkage='ward')
y_pred = hc.fit_predict(x)

# One scatter call per cluster, each with its own colour and legend entry.
cluster_colours = ['blue', 'green', 'red', 'cyan', 'magenta']
for cluster_id, colour in enumerate(cluster_colours):
    members = x[y_pred == cluster_id]
    mtp.scatter(members[:, 0], members[:, 1], s=100, c=colour,
                label=f'Cluster {cluster_id + 1}')

mtp.title('Clusters of customers')
mtp.xlabel('Annual Income (k$)')
mtp.ylabel('Spending Score (1-100)')
mtp.legend()
mtp.show()

PRACTICAL NO.3

AIM: Perform the linear regression on the given data warehouse.

# Simple linear regression on Salary_data.csv: fit on two thirds of the rows
# and draw the fitted line over both the training and the test points.
import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

data_set = pd.read_csv('Salary_data.csv')

# Every column except the last is a feature; column 1 is the target.
x = data_set.iloc[:, :-1].values
y = data_set.iloc[:, 1].values

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=1/3, random_state=0
)

regressor = LinearRegression()
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)
# Despite its name, x_pred holds the model's predictions for the training
# inputs; it is what draws the red regression line in both figures.
x_pred = regressor.predict(x_train)

# Figure 1: training points with the fitted line.
mtp.scatter(x_train, y_train, color="green")
mtp.plot(x_train, x_pred, color="red")
mtp.title("Salary vs Experience (Training Dataset)")
mtp.xlabel("Years of Experience")
mtp.ylabel("Salary(In Rupees)")
mtp.show()

# Figure 2: test points against the same fitted line.
mtp.scatter(x_test, y_test, color="blue")
mtp.plot(x_train, x_pred, color="red")
mtp.title("Salary vs Experience (Test Dataset)")
mtp.xlabel("Years of Experience")
mtp.ylabel("Salary(In Rupees)")
mtp.show()

PRACTICAL NO.4

AIM: Perform the logistic regression on the given data warehouse data.

EXAMPLE-MISS

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression predicting `admit` from `gre`, `gpa` and `rank`.
student_data = pd.read_csv("/content/Admission_P1A - Admission_P1A.csv")
col_names = student_data.columns
student_data.head(10)  # only displays something in a notebook cell

feature_cols = ['gre', 'gpa', 'rank']
X = student_data[feature_cols]
Y = student_data.admit

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1
)

# Fit once; fit() returns the estimator itself, so a second fit is redundant.
clf = LogisticRegression()
clf = clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)
print("Accuracy:", round(accuracy_score(Y_test, Y_pred), 1))

# Predict for one new applicant, using the same column names and order as
# the training features.
new = {'gre': [260], 'gpa': [2.67], 'rank': [1]}
sc2 = pd.DataFrame(new, columns=['gre', 'gpa', 'rank'])
Y_pred = clf.predict(sc2)

print(sc2)
# Print the predicted class along with the label.
print("Forecast is:", Y_pred)

EXAMPLE-OWN

# Logistic regression on admission.csv using positional column selection.
import numpy as nm
import pandas as pd
import matplotlib.pyplot as mtp
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("admission.csv")  # Adjust file path if needed

# Columns 1-3 are the features ('gre', 'gpa', 'rank'); column 0 is the
# 'admit' target.
x = dataset.iloc[:, [1, 2, 3]].values
y = dataset.iloc[:, 0].values

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

clf = LogisticRegression()
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", round(accuracy, 2))

# A single example row, in the same feature order as the training data.
new = nm.array([[260, 2.67, 1]])
prediction = clf.predict(new)

print("New Data:", new)
print("Forecast:", prediction[0])

PRACTICAL NO.5

AIM: Perform data analysis using Time series

Data analysis using time series analysis involves examining a sequence of

data points collected over a period of time, at regular intervals, to identify

patterns, trends, seasonality, and other recurring behaviors within the data,

allowing for predictions about future values based on historical observations.

Example 1:

# Line plot of calories per day of the week.
import matplotlib.pyplot as plt
import pandas as pd

days = [
    'Saturday', 'Sunday', 'Monday', 'Tuesday',
    'Wednesday', 'Thursday', 'Friday',
]
calories = [1670, 2011, 1853, 2557, 1390, 2118, 2063]

df_days_calories = pd.DataFrame({'day': days, 'calories': calories})

# Draw on the current axes so plt.show() displays this figure.
ax = plt.gca()
df_days_calories.plot(x='day', y='calories', ax=ax)
plt.show()

EXAMPLE -2

# Two line series (stress and grade per subject) drawn on the same axes.
import pandas as pd
import matplotlib.pyplot as plt

subjects = ['Math', 'English', 'History', 'Chem', 'Geo', 'Physics', 'Bio', 'CS']
stress = [9, 3, 5, 1, 8, 5, 10, 2]
grades = [15, 10, 7, 8, 11, 8, 17, 20]

# The variable name is carried over from the calories example; the frame
# holds one row per subject.
df_days_calories = pd.DataFrame(
    {'Subject': subjects, 'Stress': stress, 'Grade': grades}
)

ax = plt.gca()
# One plot call per measure, in the same order, sharing the axes.
for measure in ('Stress', 'Grade'):
    df_days_calories.plot(x='Subject', y=measure, ax=ax)
plt.show()

EXAMPLE -3(string)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error
from math import sqrt

# Autoregressive forecast of the last 7 observations of a temperature CSV
# whose value column may contain non-numeric junk.
# `infer_datetime_format` is not passed: it is deprecated since pandas 2.0
# and only ever affected parsing speed.
series = pd.read_csv(
    '1_Daily_minimum_temps.csv',
    header=0,
    index_col=0,
    parse_dates=True,
    dayfirst=True,
    na_values=['?', 'NA', '', 'null'],
)

# Turn unparseable entries into NaN, then drop those rows.
series = series.apply(pd.to_numeric, errors='coerce')
series.dropna(inplace=True)

# Use the value column as a 1-D array so each element is a plain scalar
# (a DataFrame's .values is 2-D and would yield 1-element arrays).
X = series.iloc[:, 0].values
train, test = X[:-7], X[-7:]

model = AutoReg(train, lags=30)
model_fit = model.fit()
print('Lag: %s' % model_fit.ar_lags)
print('Coefficients: %s' % model_fit.params)

# Forecast exactly the positions covered by the held-out test window.
predictions = model_fit.predict(
    start=len(train), end=len(train) + len(test) - 1, dynamic=False
)

for predicted, expected in zip(predictions, test):
    print('Predicted=%.3f, Expected=%.3f' % (predicted, expected))

rmse = sqrt(mean_squared_error(test, predictions))
print('Test RMSE: %.3f' % rmse)

plt.plot(test, label="Actual")
plt.plot(predictions, color='red', label="Predicted")
plt.legend()
plt.show()

EXAMPLE -3(numeric)

from pandas import read_csv
from matplotlib import pyplot as plt
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error
from math import sqrt

# Autoregressive forecast of the last 7 observations of the daily minimum
# temperature series.
series = read_csv('daily-min-temperatures.csv', header=0, index_col=0, parse_dates=True)

# Use the value column as a 1-D array so each element is a plain scalar
# (a DataFrame's .values is 2-D and would yield 1-element arrays).
X = series.iloc[:, 0].values
train, test = X[:-7], X[-7:]

model = AutoReg(train, lags=30)
model_fit = model.fit()
print('Lag: %s' % model_fit.ar_lags)
print('Coefficients: %s' % model_fit.params)

# Forecast exactly the positions covered by the held-out test window.
predictions = model_fit.predict(
    start=len(train), end=len(train) + len(test) - 1, dynamic=False
)

for predicted, expected in zip(predictions, test):
    print('Predicted=%.3f, Expected=%.3f' % (predicted, expected))

rmse = sqrt(mean_squared_error(test, predictions))
print('Test RMSE: %.3f' % rmse)

plt.plot(test, label="Actual")
plt.plot(predictions, color='red', label="Predicted")
plt.legend()
plt.show()

EXAMPLE -3(Miss)

from pandas import read_csv
from matplotlib import pyplot
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error
from math import sqrt

# Autoregressive forecast of the last 7 observations of the daily minimum
# temperature series.
# read_csv no longer accepts squeeze=True (removed in pandas 2.0); calling
# .squeeze("columns") on the result is the documented replacement and turns
# the single-column frame into a Series.
series = read_csv('/content/daily-min-temperatures.csv', header=0,
                  index_col=0, parse_dates=True).squeeze("columns")

X = series.values
# Train on everything except the last 7 values. The slice starts at 0 so the
# first observation is not silently discarded.
train, test = X[:len(X)-7], X[len(X)-7:]

model = AutoReg(train, 30)
model_fit = model.fit()
print('Lag: %s' % model_fit.ar_lags)
print('Coefficients: %s' % model_fit.params)

# Forecast exactly the positions covered by the held-out test window.
predictions = model_fit.predict(start=len(train), end=len(train)+len(test)-1,
                                dynamic=False)

for predicted, expected in zip(predictions, test):
    print('predicted=%f, expected=%f' % (predicted, expected))

rmse = sqrt(mean_squared_error(test, predictions))
print('Test RMSE: %.3f' % rmse)

pyplot.plot(test)
pyplot.plot(predictions, color='red')
pyplot.show()

Example-Outside

import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.ar_model import AutoReg

# Autoregressive (7-lag) forecast of the last 7 rows of the temperature CSV,
# plotted against the actual values on their dates.
data = pd.read_csv("daily-min-temperatures.csv", index_col=0, parse_dates=True)

# Work with the single value column as a Series.
series = data.iloc[:, 0]
train, test = series[:-7], series[-7:]

# Fit on the raw values. If the model is given a date index with no
# inferable frequency, statsmodels ignores it for forecasting and labels the
# forecast with integers, which cannot share an x-axis with `test`.
model = AutoReg(train.to_numpy(), lags=7).fit()

predictions = model.predict(start=len(train), end=len(train) + len(test) - 1)

# Plot both curves against the test dates so they line up.
plt.plot(test.index, test.to_numpy(), label="Actual Temperature")
plt.plot(test.index, predictions, color='red', label="Predicted Temperature")
plt.xlabel("Time")
plt.ylabel("Temperature")
plt.title("Time Series Forecasting")
plt.legend()
plt.show()

Search This Blog

ZVSkills

python(BI)

Comments

Post a Comment

Popular posts from this blog

Prac_8(AMP)

LSA10