python(BI)
PRACTICAL NO.1
AIM: Perform the data classification using classification algorithm.
1A. Perform the data classification using the Naïve Bayes algorithm.
# Gaussian Naive Bayes on the Iris dataset: hold out 40% of the samples,
# fit on the remaining 60%, and print the accuracy on the held-out part.
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

iris = load_iris()
X, y = iris.data, iris.target

# A fixed random_state keeps the train/test split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1
)

# fit() returns the estimator itself, so construction and training chain.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# accuracy_score returns a fraction; scale it to a percentage for display.
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print("Gaussian Naive Bayes model accuracy(in%):", accuracy_pct)
EXAMPLE-spam.csv
# Spam detection: bag-of-words features + Multinomial Naive Bayes.
# The model is trained on the whole CSV and then asked to classify two
# hand-written messages.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

df = pd.read_csv('/content/spam1.csv', encoding='latin-1')
# Keep the message text and its label, then give them shorter names.
df = df[['Message', 'Category']]
df.columns = ['SMS', 'Type']

# Count 1- to 4-word n-grams, drop English stop words, strip accents and
# keep only the 1000 most frequent features.
countvec = CountVectorizer(
    ngram_range=(1, 4),
    stop_words='english',
    strip_accents='unicode',
    max_features=1000,
)
bow = countvec.fit_transform(df.SMS)
X_train = bow.toarray()
Y_train = df.Type.values

mnb = MultinomialNB()
mnb.fit(X_train, Y_train)

# Each message is stored once so the text that is printed is exactly the
# text that gets classified.
message1 = 'Free gifts for all'
text1 = countvec.transform([message1])
print(message1)
print(mnb.predict(text1))

message2 = 'We will go for a lunch'
text2 = countvec.transform([message2])
print(message2)
print(mnb.predict(text2))
EXAMPLE-CORRECTED
# Spam detection on spam.csv (columns v1 = label, v2 = message text):
# bag-of-words features + Multinomial Naive Bayes, with labels encoded as
# 0 (ham) / 1 (spam).
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

file_path = 'spam.csv'
df = pd.read_csv(file_path, encoding='latin-1')
df = df.rename(columns={'v1': 'Type', 'v2': 'SMS'})
# .copy() makes the two-column frame independent of the original, so the
# column assignment below cannot raise pandas' chained-assignment warning.
df = df[['SMS', 'Type']].copy()
# Labels other than 'ham'/'spam' map to NaN and are removed by dropna().
df['Type'] = df['Type'].map({'ham': 0, 'spam': 1})
df = df.dropna()

# Count 1- to 4-word n-grams, drop English stop words, strip accents and
# keep only the 1000 most frequent features.
countvec = CountVectorizer(
    ngram_range=(1, 4),
    stop_words='english',
    strip_accents='unicode',
    max_features=1000,
)
bow = countvec.fit_transform(df['SMS'])
X_train = bow.toarray()
Y_train = df['Type'].values

mnb = MultinomialNB()
mnb.fit(X_train, Y_train)

test_messages = ["Free gifts for all", "We will go for a lunch"]
test_bow = countvec.transform(test_messages)
for msg, pred in zip(test_messages, mnb.predict(test_bow)):
    print(f"Message: '{msg}' => Prediction: {'Spam' if pred == 1 else 'Ham'}")
1B. Perform the data classification using SVM classifier.
# Linear-kernel SVM on the first two Iris feature columns, followed by a
# plot of the decision regions with all samples drawn on top.
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
X = iris.data[:, :2]  # two columns only, so the result can be drawn in 2-D
Y = iris.target

x_train, x_test, y_train, y_test = train_test_split(
    X, Y, random_state=0, test_size=0.3
)

clf = svm.SVC(kernel="linear", C=1)
clf.fit(x_train, y_train)
classifier_predictions = clf.predict(x_test)
print(accuracy_score(y_test, classifier_predictions) * 100)

# Build a grid with step h that extends one unit past the data on each side.
h = 0.02
x_min = X[:, 0].min() - 1
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1
y_max = X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Classify every grid point, then fold the labels back into the grid shape.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = clf.predict(grid_points).reshape(xx.shape)

plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.coolwarm)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Linear")
plt.show()
PRACTICAL NO.2
AIM: Perform the data clustering using clustering algorithm.
2A. To demonstrate an unsupervised Machine Learning algorithm: the partitional clustering technique (K-Means clustering algorithm).
# K-Means clustering of mall customers: draw the elbow curve for k = 1..10,
# then fit k = 5 and plot the clusters with their centroids.
import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
from sklearn.cluster import KMeans

dataset = pd.read_csv('Mall_Customers.csv')
# Columns at positions 3 and 4; the axis labels further down describe them
# as annual income and spending score.
x = dataset.iloc[:, [3, 4]].values

# Elbow method: record the within-cluster sum of squares (inertia_) per k.
# n_init is set explicitly because its default differs between
# scikit-learn releases; pinning it keeps runs comparable across versions.
wcss_list = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(x)
    wcss_list.append(kmeans.inertia_)

mtp.plot(range(1, 11), wcss_list)
mtp.title('The Elbow Method Graph')
mtp.xlabel('Number of clusters(k)')
mtp.ylabel('wcss_list')
mtp.show()

kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)
y_predict = kmeans.fit_predict(x)

# One scatter call per cluster label, each with its own colour and legend
# entry ("Cluster 1" .. "Cluster 5").
cluster_colors = ['blue', 'green', 'red', 'cyan', 'magenta']
for cluster_id, color in enumerate(cluster_colors):
    members = x[y_predict == cluster_id]
    mtp.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label=f'Cluster {cluster_id + 1}')

mtp.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=300, c='yellow', label='Centroid')
mtp.title('Clusters of customers')
mtp.xlabel('Annual income(k$)')
mtp.ylabel('Spending Score(1-100)')
mtp.legend()
mtp.show()
# Hierarchical (agglomerative) clustering of mall customers: draw the
# dendrogram, then fit 5 clusters with Ward linkage and plot them.
import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering

dataset = pd.read_csv('Mall_Customers.csv')
# Columns at positions 3 and 4; the axis labels further down describe them
# as annual income and spending score.
x = dataset.iloc[:, [3, 4]].values

dendro = shc.dendrogram(shc.linkage(x, method="ward"))
mtp.title("Dendrogram Plot")
mtp.ylabel("Euclidean Distances")
mtp.xlabel("Customers")
mtp.show()

hc = AgglomerativeClustering(n_clusters=5, linkage='ward')
y_pred = hc.fit_predict(x)

# One scatter call per cluster label, each with its own colour and legend
# entry ("Cluster 1" .. "Cluster 5").
cluster_colors = ['blue', 'green', 'red', 'cyan', 'magenta']
for cluster_id, color in enumerate(cluster_colors):
    members = x[y_pred == cluster_id]
    mtp.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label=f'Cluster {cluster_id + 1}')

mtp.title('Clusters of customers')
mtp.xlabel('Annual Income (k$)')
mtp.ylabel('Spending Score (1-100)')
mtp.legend()
mtp.show()
PRACTICAL NO.5
AIM: Perform data analysis using Time series
Data analysis using time series analysis involves examining a sequence of
data points collected over a period of time, at regular intervals, to identify
patterns, trends, seasonality, and other recurring behaviors within the data,
allowing for predictions about future values based on historical observations.
Example 1:
# Line plot of calories per weekday, drawn on the current matplotlib axes.
import matplotlib.pyplot as plt
import pandas as pd

# (day, calories) pairs, listed in the order they appear on the x axis.
daily_intake = [
    ('Saturday', 1670),
    ('Sunday', 2011),
    ('Monday', 1853),
    ('Tuesday', 2557),
    ('Wednesday', 1390),
    ('Thursday', 2118),
    ('Friday', 2063),
]
days = [day for day, _ in daily_intake]
calories = [kcal for _, kcal in daily_intake]

df_days_calories = pd.DataFrame({'day': days, 'calories': calories})

ax = plt.gca()
df_days_calories.plot(x='day', y='calories', ax=ax)
plt.show()
EXAMPLE -2
# Two line plots (stress and grade per subject) drawn on the same axes.
import matplotlib.pyplot as plt
import pandas as pd

subjects = ['Math', 'English', 'History', 'Chem', 'Geo', 'Physics', 'Bio', 'CS']
stress = [9, 3, 5, 1, 8, 5, 10, 2]
grades = [15, 10, 7, 8, 11, 8, 17, 20]

# The frame keeps the variable name df_days_calories, although it holds
# per-subject stress and grade values here.
df_days_calories = pd.DataFrame(
    list(zip(subjects, stress, grades)),
    columns=['Subject', 'Stress', 'Grade'],
)

# Passing the same axes to both calls overlays the two lines in one chart.
ax = plt.gca()
for column in ('Stress', 'Grade'):
    df_days_calories.plot(x='Subject', y=column, ax=ax)
plt.show()
EXAMPLE -3(string)
# Autoregression forecast on a temperature CSV whose value column may
# contain non-numeric entries: clean the data, fit AR(30) on everything but
# the last 7 observations, and compare the 7-step forecast with the actuals.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error
from math import sqrt

# infer_datetime_format is omitted: pandas deprecated it in 2.0.
series = pd.read_csv(
    '1_Daily_minimum_temps.csv',
    header=0,
    index_col=0,
    parse_dates=True,
    dayfirst=True,
    na_values=['?', 'NA', '', 'null'],
)
# Anything that cannot be parsed as a number becomes NaN and is dropped.
series = series.apply(pd.to_numeric, errors='coerce')
series.dropna(inplace=True)

# Collapse the single-column frame to a Series so X is 1-D; with a 2-D
# array each test[i] would be a length-1 array rather than a scalar.
X = series.squeeze("columns").values
train, test = X[:-7], X[-7:]

model = AutoReg(train, lags=30)
model_fit = model.fit()
print('Lag: %s' % model_fit.ar_lags)
print('Coefficients: %s' % model_fit.params)

# Forecast exactly the positions covered by the held-out test window.
predictions = model_fit.predict(start=len(train), end=len(train)+len(test)-1, dynamic=False)
for predicted, expected in zip(predictions, test):
    print('Predicted=%.3f, Expected=%.3f' % (predicted, expected))

rmse = sqrt(mean_squared_error(test, predictions))
print('Test RMSE: %.3f' % rmse)

plt.plot(test, label="Actual")
plt.plot(predictions, color='red', label="Predicted")
plt.legend()
plt.show()
EXAMPLE -3(numeric)
# Autoregression forecast on the daily minimum temperatures CSV: fit AR(30)
# on everything but the last 7 observations and compare the 7-step forecast
# with the actual values.
from pandas import read_csv
from matplotlib import pyplot as plt
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error
from math import sqrt

series = read_csv('daily-min-temperatures.csv', header=0, index_col=0, parse_dates=True)

# Collapse the single-column frame to a Series so X is 1-D; with a 2-D
# array each test[i] would be a length-1 array rather than a scalar.
X = series.squeeze("columns").values
train, test = X[:-7], X[-7:]

model = AutoReg(train, lags=30)
model_fit = model.fit()
print('Lag: %s' % model_fit.ar_lags)
print('Coefficients: %s' % model_fit.params)

# Forecast exactly the positions covered by the held-out test window.
predictions = model_fit.predict(start=len(train), end=len(train)+len(test)-1, dynamic=False)
for predicted, expected in zip(predictions, test):
    print('Predicted=%.3f, Expected=%.3f' % (predicted, expected))

rmse = sqrt(mean_squared_error(test, predictions))
print('Test RMSE: %.3f' % rmse)

plt.plot(test, label="Actual")
plt.plot(predictions, color='red', label="Predicted")
plt.legend()
plt.show()
EXAMPLE -3(Miss)
# Autoregression forecast on the daily minimum temperatures CSV: fit AR(30)
# on everything but the last 7 observations and compare the 7-step forecast
# with the actual values.
from pandas import read_csv
from matplotlib import pyplot
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error
from math import sqrt

# read_csv(squeeze=True) was removed in pandas 2.0; squeezing the
# single-column frame afterwards yields the same Series.
series = read_csv(
    '/content/daily-min-temperatures.csv',
    header=0,
    index_col=0,
    parse_dates=True,
).squeeze("columns")
X = series.values

# Train on every observation before the final week. The slice starts at 0
# so the first observation is not discarded.
train, test = X[:len(X) - 7], X[len(X) - 7:]

model = AutoReg(train, lags=30)
model_fit = model.fit()
print('Lag: %s' % model_fit.ar_lags)
print('Coefficients: %s' % model_fit.params)

# Forecast exactly the positions covered by the held-out test window.
predictions = model_fit.predict(
    start=len(train),
    end=len(train) + len(test) - 1,
    dynamic=False,
)
for predicted, expected in zip(predictions, test):
    print('predicted=%f, expected=%f' % (predicted, expected))

rmse = sqrt(mean_squared_error(test, predictions))
print('Test RMSE: %.3f' % rmse)

pyplot.plot(test)
pyplot.plot(predictions, color='red')
pyplot.show()
# Compact AR(7) forecast of the last 7 observations, plotted against the
# actual values on a shared time axis.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.ar_model import AutoReg

data = pd.read_csv("daily-min-temperatures.csv", index_col=0, parse_dates=True)
train, test = data[:-7], data[-7:]

model = AutoReg(train, lags=7).fit()
predictions = model.predict(start=len(train), end=len(train) + len(test) - 1)

# Put the forecast on the same date index as the test window. If the date
# index carries no frequency, statsmodels may return the forecast with an
# integer index, and the two lines would then be drawn on unrelated x
# ranges.
predictions = pd.Series(np.asarray(predictions).ravel(), index=test.index)

plt.plot(test, label="Actual Temperature")
plt.plot(predictions, color='red', label="Predicted Temperature")
plt.xlabel("Time")
plt.ylabel("Temperature")
plt.title("Time Series Forecasting")
plt.legend()
plt.show()
Comments
Post a Comment