python(BI)

PRACTICAL NO.1

AIM: Perform the data classification using classification algorithm. 

1A. Perform the data classification using the Naïve Bayes algorithm.

# Gaussian Naive Bayes on the iris dataset: train on 60% of the samples and
# report the classification accuracy on the remaining 40%.
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

iris = load_iris()
X, y = iris.data, iris.target

# A fixed random_state makes the shuffle (and so the reported score) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1
)

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# accuracy_score returns a fraction; scale it to a percentage for display.
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print("Gaussian Naive Bayes model accuracy(in%):", accuracy_pct)

EXAMPLE-spam.csv

# SMS spam classifier: bag-of-words counts fed to a multinomial Naive Bayes
# model, then two sample messages are classified.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

df = pd.read_csv('/content/spam1.csv', encoding='latin-1')

# Keep only the text and label columns and give them short names.
df = df[['Message', 'Category']]
df.columns = ['SMS', 'Type']

# The call must be a single expression: breaking it in the middle of the
# `strip_accents` keyword is a syntax error.
countvec = CountVectorizer(
    ngram_range=(1, 4),
    stop_words='english',
    strip_accents='unicode',
    max_features=1000,
)
bow = countvec.fit_transform(df.SMS)

X_train = bow.toarray()
Y_train = df.Type.values

mnb = MultinomialNB()
mnb.fit(X_train, Y_train)

# Echo exactly the string that is classified so the printed message and the
# prediction below it always refer to the same text.
msg1 = 'Free gifts for all'
text1 = countvec.transform([msg1])
print(msg1)
print(mnb.predict(text1))

msg2 = 'We will go for a lunch'
text2 = countvec.transform([msg2])
print(msg2)
print(mnb.predict(text2))

EXAMPLE-CORRECTED 

# SMS spam classifier on spam.csv (columns v1 = label, v2 = message text):
# labels are mapped to 0/1, messages are vectorised as n-gram counts and a
# multinomial Naive Bayes model classifies two sample messages.
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

file_path = 'spam.csv'
df = pd.read_csv(file_path, encoding='latin-1')

# Rename the raw columns, then keep just the two that are used.
df = df.rename(columns={'v1': 'Type', 'v2': 'SMS'})
df = df[['SMS', 'Type']]

# ham -> 0, spam -> 1; any other label becomes NaN and is dropped below.
label_codes = {'ham': 0, 'spam': 1}
df['Type'] = df['Type'].map(label_codes)
df = df.dropna()

countvec = CountVectorizer(
    ngram_range=(1, 4),
    stop_words='english',
    strip_accents='unicode',
    max_features=1000,
)
bow = countvec.fit_transform(df['SMS'])

X_train = bow.toarray()
Y_train = df['Type'].values

mnb = MultinomialNB()
mnb.fit(X_train, Y_train)

test_messages = ["Free gifts for all", "We will go for a lunch"]
test_bow = countvec.transform(test_messages)
test_predictions = mnb.predict(test_bow)

for msg, pred in zip(test_messages, test_predictions):
    verdict = 'Spam' if pred == 1 else 'Ham'
    print(f"Message: '{msg}' => Prediction: {verdict}")

1B. Perform the data classification using SVM classifier. 

# Linear-kernel SVM on the first two iris features: prints the test accuracy
# (in percent) and draws the decision regions with the samples on top.
# NOTE: top-level statements must start at column 0; a stray leading space on
# the first import raises IndentationError.
from sklearn import svm, datasets
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
X = iris.data[:, :2]  # two columns only, so the boundary can be drawn in 2-D
Y = iris.target

x_train, x_test, y_train, y_test = train_test_split(
    X, Y, random_state=0, test_size=0.3
)

clf = svm.SVC(kernel="linear", C=1).fit(x_train, y_train)
classifier_predictions = clf.predict(x_test)
print(accuracy_score(y_test, classifier_predictions) * 100)

# Build a dense grid (step h) covering the data range padded by 1 on each side
# and classify every grid point to obtain the decision regions.
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)  # back to grid shape for contourf

plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.coolwarm)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Linear")
plt.show()

PRACTICAL NO.2

AIM: Perform the data clustering using clustering algorithm. 

2A. To demonstrate an unsupervised machine learning algorithm: the partitioned clustering technique (K-Means clustering algorithm).

# K-Means on two columns of Mall_Customers.csv: first an elbow plot of the
# within-cluster sum of squares for k = 1..10, then the 5-cluster solution.
import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
from sklearn.cluster import KMeans

dataset = pd.read_csv('Mall_Customers.csv')
x = dataset.iloc[:, [3, 4]].values  # columns 3 and 4 are the two plotted axes

# inertia_ is the within-cluster sum of squares of a fitted model.
wcss_list = [
    KMeans(n_clusters=k, init='k-means++', random_state=42).fit(x).inertia_
    for k in range(1, 11)
]

mtp.plot(range(1, 11), wcss_list)
mtp.title('The Elbow Method Graph')
mtp.xlabel('Number of clusters(k)')
mtp.ylabel('wcss_list')
mtp.show()

kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
y_predict = kmeans.fit_predict(x)

# One scatter call per cluster so that each gets its own colour and legend entry.
cluster_colors = ['blue', 'green', 'red', 'cyan', 'magenta']
for cluster_id, color in enumerate(cluster_colors):
    members = y_predict == cluster_id
    mtp.scatter(
        x[members, 0],
        x[members, 1],
        s=100,
        c=color,
        label=f'Cluster {cluster_id + 1}',
    )

centers = kmeans.cluster_centers_
mtp.scatter(centers[:, 0], centers[:, 1], s=300, c='yellow', label='Centroid')

mtp.title('Clusters of customers')
mtp.xlabel('Annual income(k$)')
mtp.ylabel('Spending Score(1-100)')
mtp.legend()
mtp.show()

2B. Solving the Wholesale Customer Segmentation Problem using Hierarchical Clustering (Agglomerative Clustering algorithm)

# Agglomerative (Ward-linkage) clustering on two columns of
# Mall_Customers.csv: a dendrogram first, then the 5-cluster assignment.
import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering

dataset = pd.read_csv('Mall_Customers.csv')
x = dataset.iloc[:, [3, 4]].values

# Dendrogram of the Ward linkage.
dendro = shc.dendrogram(shc.linkage(x, method="ward"))
mtp.title("Dendrogram Plot")  # spelling of the user-visible title corrected
mtp.ylabel("Euclidean Distances")
mtp.xlabel("Customers")
mtp.show()

hc = AgglomerativeClustering(n_clusters=5, linkage='ward')
y_pred = hc.fit_predict(x)

# One scatter call per cluster so that each gets its own colour and legend entry.
cluster_colors = ['blue', 'green', 'red', 'cyan', 'magenta']
for cluster_id, color in enumerate(cluster_colors):
    members = y_pred == cluster_id
    mtp.scatter(
        x[members, 0],
        x[members, 1],
        s=100,
        c=color,
        label=f'Cluster {cluster_id + 1}',
    )

mtp.title('Clusters of customers')
mtp.xlabel('Annual Income (k$)')
mtp.ylabel('Spending Score (1-100)')
mtp.legend()
mtp.show()

PRACTICAL NO.3 
AIM: Perform the linear regression on the given data warehouse.
# Simple linear regression on Salary_data.csv: fit on two thirds of the rows,
# then plot the fitted line over the training points and over the test points.
import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

data_set = pd.read_csv('Salary_data.csv')
x = data_set.iloc[:, :-1].values  # every column except the last
y = data_set.iloc[:, 1].values    # column 1 is the regression target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=1/3, random_state=0
)

regressor = LinearRegression()
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)
x_pred = regressor.predict(x_train)  # fitted values on the training inputs

# Both figures draw the same fitted line (training inputs vs. fitted values);
# only the scattered points, their colour and the title differ.
figures = [
    (x_train, y_train, "green", "Salary vs Experience (Training Dataset)"),
    (x_test, y_test, "blue", "Salary vs Experience (Test Dataset)"),
]
for points_x, points_y, point_color, figure_title in figures:
    mtp.scatter(points_x, points_y, color=point_color)
    mtp.plot(x_train, x_pred, color="red")
    mtp.title(figure_title)
    mtp.xlabel("Years of Experience")
    mtp.ylabel("Salary(In Rupees)")
    mtp.show()
PRACTICAL NO.4 
AIM: Perform the logistic regression on the given data warehouse data.
EXAMPLE-MISS
# Logistic regression on the admissions data: predict `admit` from
# gre / gpa / rank, report the test accuracy, then forecast one new applicant.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

student_data = pd.read_csv("/content/Admission_P1A - Admission_P1A.csv")
col_names = student_data.columns
student_data.head(10)

feature_cols = ['gre', 'gpa', 'rank']
X = student_data[feature_cols]
Y = student_data.admit

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1
)

# Fit once; fit() returns the estimator itself, so a second call is redundant.
clf = LogisticRegression()
clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)
print("Accuracy:", round(accuracy_score(Y_test, Y_pred), 1))

# Forecast for a single new applicant. A separate name keeps the test-set
# predictions in Y_pred intact.
new = {'gre': [260], 'gpa': [2.67], 'rank': [1]}
sc2 = pd.DataFrame(new, columns=['gre', 'gpa', 'rank'])
forecast = clf.predict(sc2)
print(sc2)
# Print the predicted class itself, not just the label text.
print("Forecast is:", forecast[0])
EXAMPLE-OWN
# Logistic regression on admission.csv using positional column selection:
# column 0 is the target, columns 1-3 are the features.
import numpy as nm
import pandas as pd
import matplotlib.pyplot as mtp
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("admission.csv")  # adjust the path if the file lives elsewhere

feature_positions = [1, 2, 3]  # 'gre', 'gpa', 'rank'
target_position = 0            # 'admit'
x = dataset.iloc[:, feature_positions].values
y = dataset.iloc[:, target_position].values

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

clf = LogisticRegression()
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", round(test_accuracy, 2))

# One unseen sample, in the same column order as the training features.
new = nm.array([[260, 2.67, 1]])
prediction = clf.predict(new)

print("New Data:", new)
print("Forecast:", prediction[0])

PRACTICAL NO.5

AIM: Perform data analysis using Time series

Data analysis using time series analysis involves examining a sequence of

data points collected over a period of time, at regular intervals, to identify

patterns, trends, seasonality, and other recurring behaviors within the data,

allowing for predictions about future values based on historical observations.

Example 1:

# Line plot of calories per day of the week.
import matplotlib.pyplot as plt
import pandas as pd

days = [
    'Saturday', 'Sunday', 'Monday', 'Tuesday',
    'Wednesday', 'Thursday', 'Friday',
]
calories = [1670, 2011, 1853, 2557, 1390, 2118, 2063]

df_days_calories = pd.DataFrame({'day': days, 'calories': calories})

# Draw onto the current axes so plt.show() displays this figure.
ax = plt.gca()
df_days_calories.plot(x='day', y='calories', ax=ax)
plt.show()

EXAMPLE -2

# Stress level and grade per subject, drawn as two lines on shared axes.
import pandas as pd
import matplotlib.pyplot as plt

subjects = ['Math', 'English', 'History', 'Chem', 'Geo', 'Physics', 'Bio', 'CS']
stress = [9, 3, 5, 1, 8, 5, 10, 2]
grades = [15, 10, 7, 8, 11, 8, 17, 20]

df_days_calories = pd.DataFrame(
    {'Subject': subjects, 'Stress': stress, 'Grade': grades}
)

# Passing the same axes to both calls overlays the two series in one figure.
ax = plt.gca()
for column in ('Stress', 'Grade'):
    df_days_calories.plot(x='Subject', y=column, ax=ax)
plt.show()

EXAMPLE -3(string)

# Autoregression forecast: fit AutoReg with 30 lags on all but the last 7
# observations, predict those 7, then report the RMSE and plot both curves.
# Non-numeric cells in the CSV are coerced to NaN and dropped first.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error
from math import sqrt

# `infer_datetime_format` is deprecated in pandas 2.x and has no effect there,
# so it is not passed.
series = pd.read_csv(
    '1_Daily_minimum_temps.csv',
    header=0,
    index_col=0,
    parse_dates=True,
    dayfirst=True,
    na_values=['?', 'NA', '', 'null'],
)

# Anything that is not a number becomes NaN and is removed.
series = series.apply(pd.to_numeric, errors='coerce')
series.dropna(inplace=True)

# Work on the single value column as a 1-D array, so each element is a scalar
# (formatting a 1-element array with %f relies on a deprecated NumPy conversion).
X = series.iloc[:, 0].to_numpy()
train, test = X[:-7], X[-7:]

model = AutoReg(train, lags=30)
model_fit = model.fit()
print('Lag: %s' % model_fit.ar_lags)
print('Coefficients: %s' % model_fit.params)

# Out-of-sample forecast covering exactly the held-out window.
predictions = model_fit.predict(
    start=len(train), end=len(train) + len(test) - 1, dynamic=False
)
for predicted, expected in zip(predictions, test):
    print('Predicted=%.3f, Expected=%.3f' % (predicted, expected))

rmse = sqrt(mean_squared_error(test, predictions))
print('Test RMSE: %.3f' % rmse)

plt.plot(test, label="Actual")
plt.plot(predictions, color='red', label="Predicted")
plt.legend()
plt.show()

EXAMPLE -3(numeric)

# Autoregression forecast: fit AutoReg with 30 lags on all but the last 7
# observations, predict those 7, then report the RMSE and plot both curves.
from pandas import read_csv
from matplotlib import pyplot as plt
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error
from math import sqrt

series = read_csv(
    'daily-min-temperatures.csv', header=0, index_col=0, parse_dates=True
)

# Work on the single value column as a 1-D array, so each element is a scalar
# (formatting a 1-element array with %f relies on a deprecated NumPy conversion).
X = series.iloc[:, 0].values
train, test = X[:-7], X[-7:]

model = AutoReg(train, lags=30)
model_fit = model.fit()
print('Lag: %s' % model_fit.ar_lags)
print('Coefficients: %s' % model_fit.params)

# Out-of-sample forecast covering exactly the held-out window.
predictions = model_fit.predict(
    start=len(train), end=len(train) + len(test) - 1, dynamic=False
)
for predicted, expected in zip(predictions, test):
    print('Predicted=%.3f, Expected=%.3f' % (predicted, expected))

rmse = sqrt(mean_squared_error(test, predictions))
print('Test RMSE: %.3f' % rmse)

plt.plot(test, label="Actual")
plt.plot(predictions, color='red', label="Predicted")
plt.legend()
plt.show()

EXAMPLE -3(Miss)

# Autoregression forecast: fit AutoReg with 30 lags, predict the last 7
# observations, then report the RMSE and plot actual vs. predicted values.
from pandas import read_csv
from matplotlib import pyplot
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error
from math import sqrt

# read_csv no longer accepts squeeze=True (removed in pandas 2.0); squeezing
# the one-column frame afterwards yields the same Series.
series = read_csv(
    '/content/daily-min-temperatures.csv',
    header=0,
    index_col=0,
    parse_dates=True,
).squeeze("columns")

X = series.values
# NOTE(review): the training slice starts at index 1, so the first observation
# is not used for fitting - confirm this is intended.
train, test = X[1:len(X)-7], X[len(X)-7:]

model = AutoReg(train, lags=30)
model_fit = model.fit()
print('Lag: %s' % model_fit.ar_lags)
print('Coefficients: %s' % model_fit.params)

# Out-of-sample forecast covering exactly the held-out window.
predictions = model_fit.predict(
    start=len(train), end=len(train) + len(test) - 1, dynamic=False
)
for i in range(len(predictions)):
    # The loop body must be indented; otherwise the file fails to parse.
    print('predicted=%f, expected=%f' % (predictions[i], test[i]))

rmse = sqrt(mean_squared_error(test, predictions))
print('Test RMSE: %.3f' % rmse)

pyplot.plot(test)
pyplot.plot(predictions, color='red')
pyplot.show()

Example-Outside
# Autoregression forecast with 7 lags: hold out the last 7 rows, predict them
# and plot the forecast against the actual values.
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.ar_model import AutoReg

data = pd.read_csv("daily-min-temperatures.csv", index_col=0, parse_dates=True)

train, test = data[:-7], data[-7:]

model = AutoReg(train, lags=7).fit()

# Out-of-sample forecast covering exactly the held-out window.
predictions = model.predict(start=len(train), end=len(train) + len(test) - 1)

plt.plot(test, label="Actual Temperature")
# Plot the forecast against the held-out dates explicitly. If statsmodels
# cannot attach a frequency to the date index, the forecast comes back on an
# integer index and would otherwise land on a different x-range than `test`.
plt.plot(test.index, list(predictions), color='red', label="Predicted Temperature")
plt.xlabel("Time")
plt.ylabel("Temperature")
plt.title("Time Series Forecasting")
plt.legend()
plt.show()



Comments

Popular posts from this blog

Prac_8(AMP)

LSA10