A5code

.py
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 9 16:16:38 2023

@author: rfran

Part A compares August 2017 air temperature between three beach weather
stations (Bartlett test followed by a one-way ANOVA and a boxplot).
Part B examines humidity at the same stations (normality checks followed
by a Levene test).
"""
import pandas as pd
import math
from scipy import stats

# Constant for Alpha declared
ALPHA = 0.05

# Location of the sensor export.
# NOTE(review): the extracted source showed "Beach_Weather_Stat ions" with a
# stray space, presumably a line-wrap artefact -- confirm the real file name.
datfile = 'C:\\Users\\rfran\\Desktop\\Concordia\\DataScience\\Assignment5\\Beach_Weather_Stations_-_Automated_Sensors.csv'

# Stations that take part in the comparison.
STATIONS = (
    "63rd Street Weather Station",
    "Oak Street Weather Station",
    "Foster Weather Station",
)

# Columns that neither part of the analysis reads.
UNUSED_COLUMNS = [
    'Barometric Pressure', 'Wet Bulb Temperature', 'Rain Intensity',
    'Interval Rain', 'Total Rain', 'Precipitation Type', 'Wind Direction',
    'Wind Speed', 'Maximum Wind Speed', 'Solar Radiation', 'Heading',
    'Battery Life', 'Measurement Timestamp Label', 'Measurement ID',
]


def _load_station_readings(path):
    """Read the CSV at *path* and keep only the rows for ``STATIONS``.

    Returns a tuple ``(readings, raw_row_count)`` where ``readings`` has the
    unused columns removed and ``raw_row_count`` is the number of rows in the
    file before any filtering.
    """
    raw = pd.read_csv(path)
    trimmed = raw.drop(columns=UNUSED_COLUMNS)
    # .copy() so later column assignments do not write into a view of `raw`.
    readings = trimmed[trimmed['Station Name'].isin(STATIONS)].copy()
    return readings, len(raw)


def _blank_out_of_range(readings, column, low, high):
    """Replace values of *column* outside ``[low, high]`` with NaN, in place.

    A vectorised mask is used instead of a positional loop: after filtering,
    the frame's index labels are no longer guaranteed to be 0..n-1, so
    ``series[i]`` would raise KeyError or address the wrong row. A real NaN
    (not the string 'NaN') keeps the column numeric.
    """
    in_range = readings[column].between(low, high)
    readings[column] = readings[column].where(in_range)


def _report_normality(values):
    """Print skew/kurtosis diagnostics for *values* and classify normality.

    Missing readings are dropped first because the scipy tests propagate NaN.
    Returns ``(skew_test, kurtosis_test, is_possibly_normal, verdict_text)``.
    """
    clean = values.dropna()
    kurt_result = stats.kurtosistest(clean)
    skew_result = stats.skewtest(clean)
    # Index 1 of the test results is the p-value, so the sample statistics
    # are computed separately to match the printed labels.
    print("\nSample Skew=", stats.skew(clean),
          "\nSample Kurtosis=", stats.kurtosis(clean))
    print("z-score from skewtest ", skew_result[0])
    print("z-score from kurtosistest ", kurt_result[0])
    possibly_normal = True
    verdict = "possibly normal, so use parametric technique"
    if abs(skew_result[0]) > 2 or abs(kurt_result[0]) > 7:
        possibly_normal = False
        verdict = "non-normal, so use non-parametric technique"
    print("This data is", verdict)
    return skew_result, kurt_result, possibly_normal, verdict


# ------------------------------ PART A ------------------------------------

# fetch the data into a frame, remembering the original data size
frame, originalLen = _load_station_readings(datfile)

# temperature should only be -30 to 40, so NaN for bad readings
_blank_out_of_range(frame, 'Air Temperature', -30, 40)

# keep only the August 2017 readings
dateFrame = frame['Measurement Timestamp'].str.split(pat=' ', n=1, expand=True)
splitFrame = dateFrame[0].str.split(pat='/', expand=True)
frame['Month'] = splitFrame[0]
frame['Year'] = splitFrame[2]
frame.drop(['Measurement Timestamp'], axis=1, inplace=True)
# Exact comparisons instead of substring matches; a numeric compare accepts
# both "8" and "08".
isAugust = pd.to_numeric(frame['Month'], errors='coerce') == 8
is2017 = frame['Year'] == "2017"
frame = frame[isAugust & is2017]

# scrubbing is complete, time for data analysis
scrubbedLen = len(frame)
print("Data has been scrubbed. Proceeding to analysis.")
print("Original size:", originalLen)
print("New size:", scrubbedLen)

# calculate and display skew and kurtosis of the data, then reveal if the
# data is possibly normal
skewTest, kurtTest, normalTest, normalCheck = _report_normality(
    frame['Air Temperature'])

# get our separate data for the different stations
frameOak = frame[(frame['Station Name'] == "Oak Street Weather Station")]
frameSix = frame[(frame['Station Name'] == "63rd Street Weather Station")]
frameFos = frame[(frame['Station Name'] == "Foster Weather Station")]

# perform the ANOVA test after bartlett test (NaN readings are excluded so
# the statistics are not NaN themselves)
tempsOak = frameOak['Air Temperature'].dropna()
tempsSix = frameSix['Air Temperature'].dropna()
tempsFos = frameFos['Air Temperature'].dropna()
bartTest = stats.bartlett(tempsOak, tempsSix, tempsFos)
print("Bartlett test:", bartTest[0], "\nP value:", bartTest[1])
if bartTest[1] > ALPHA:
    print("Variance is homogenetic. Analysis should be reliable.")
else:
    print("WARNING: varaince in the data is heterogenetic, so result analysis may not be reliable.")
oneWayTest = stats.f_oneway(tempsOak, tempsSix, tempsFos)

# report findings
print("63rd mean air temp:", round(frameSix['Air Temperature'].mean(), 2),
      "data values:", len(frameSix))
print("Oak St. mean air temp:", round(frameOak['Air Temperature'].mean(), 2),
      "data values:", len(frameOak))
print("Foster mean air temp:", round(frameFos['Air Temperature'].mean(), 2),
      "data values:", len(frameFos))
print("F stat:", oneWayTest[0], "\nP value:", oneWayTest[1])
if oneWayTest[1] > ALPHA:
    print("There was not significant difference in mean air temperature between the locations.")
else:
    print("There was significant difference in mean air temperature between the locations.")

# make a boxplot of the temperatures by location
boxPlot = frame.boxplot(by=['Station Name'], column=['Air Temperature'],
                        fontsize='small')
print("Boxplot generated.")

# ------------------------------ PART B ------------------------------------
print("\n=======PART B STARTS HERE=======\n")

# fetch the data into a frame again (Part B uses every month, not just
# August 2017), remembering the original data size
frame, originalLen = _load_station_readings(datfile)

# only Humidity from 0 to 100 is allowed, so NaN for bad readings
_blank_out_of_range(frame, 'Humidity', 0, 100)

# scrubbing is complete, time for data analysis
scrubbedLen = len(frame)
print("Data has been scrubbed. Proceeding to analysis.")
print("Original size:", originalLen)
print("New size:", scrubbedLen)

# quick check for normality: flag a tail that reaches more than three
# standard deviations from the mean
stDev = frame['Humidity'].std()
humidMean = frame['Humidity'].mean()
maxValue = frame['Humidity'].max()
minValue = frame['Humidity'].min()
if minValue < humidMean - stDev * 3:
    print("Notable left skew.")
elif maxValue > humidMean + stDev * 3:
    print("Notable right skew.")
else:
    print("No notable skew from quick test.")

# calculate and display skew and kurtosis of the data, then reveal if the
# data is possibly normal
skewTest, kurtTest, normalTest, normalCheck = _report_normality(
    frame['Humidity'])

# get our separate data for the different stations
frameOak = frame[(frame['Station Name'] == "Oak Street Weather Station")]
frameSix = frame[(frame['Station Name'] == "63rd Street Weather Station")]
frameFos = frame[(frame['Station Name'] == "Foster Weather Station")]

# perform the Kruskal-Wallis test after levene test (NaN readings excluded)
levTest = stats.levene(frameOak['Humidity'].dropna(),
                       frameSix['Humidity'].dropna(),
                       frameFos['Humidity'].dropna())
print("Levene Test:", levTest[0], "\nP value:", levTest[1])
if levTest[1] > ALPHA:
    print("Variance is homogenetic. Analysis should be reliable.")
Page1of 4
Uploaded by ChefToadPerson616 on coursehero.com