-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathURL.py
More file actions
49 lines (45 loc) · 1.76 KB
/
URL.py
File metadata and controls
49 lines (45 loc) · 1.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# @Time : 2021/7/4 11:16 PM
# @Author : zyc
# @File : URL.py
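# @Title : CSV dataset loader with pickle cache
# @Description : Load a CSV dataset, drop rows containing Infinity and columns/rows containing NaN, and cache the cleaned frame as a pickle.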
import os
import pandas as pd
import numpy as np
def _dropInfiniteRows(df):
    """Return *df* without the rows that hold +/-Infinity in any column."""
    for col in df.columns:
        # Look the labels up on the index directly instead of materialising a
        # filtered copy of the whole frame just to read its index.
        infRows = df.index[df[col].isin([np.inf, -np.inf])]
        if not infRows.empty:
            print('deleting {} rows with Infinity in column {}'.format(len(infRows), col))
            df = df.drop(index=infRows)
    return df


def loadData(csvFile):
    """Load *csvFile* as a cleaned DataFrame, caching the result as a pickle.

    The cleaned frame is stored next to the CSV under the name
    ``<csvFile>DroppedNaNCols.pickle``; when that file already exists it is
    returned as-is and the CSV is not read at all.

    Cleaning steps, in order:

    1. strip surrounding whitespace from the column names;
    2. drop every row that contains positive or negative Infinity;
    3. cast the ``argPathRatio`` column to float when the CSV has one;
    4. drop every column that still contains a NaN;
    5. drop every row that still contains a NaN.

    Args:
        csvFile: Path of the CSV file to load.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    pickleDump = '{}DroppedNaNCols.pickle'.format(csvFile)
    if os.path.exists(pickleDump):
        # NOTE: unpickling can execute arbitrary code, so the cache file must
        # only ever be one that this function wrote itself.
        return pd.read_pickle(pickleDump)

    df = pd.read_csv(csvFile, low_memory=False, na_values='NaN')

    # Strip the whitespace from column names.
    df = df.rename(str.strip, axis='columns')

    df = _dropInfiniteRows(df)

    # Item assignment semantics instead of attribute assignment; the cast is
    # skipped for CSV files that simply do not carry this column.
    if 'argPathRatio' in df.columns:
        df = df.astype({'argPathRatio': 'float'})

    # Drop all columns with NaN values.
    beforeColumns = df.shape[1]
    df = df.dropna(axis='columns')
    print('Dropped {} columns with NaN values'.format(beforeColumns - df.shape[1]))

    # Drop all rows with NaN values.
    beforeRows = df.shape[0]
    df = df.dropna()
    print('Dropped {} rows with NaN values'.format(beforeRows - df.shape[0]))

    df.to_pickle(pickleDump)
    return df
# Build (or reuse the cached copy of) the cleaned dataset when the module runs.
df = loadData(csvFile='FinalDataset/ALL.csv')