Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,350 changes: 1,350 additions & 0 deletions notebooks/4.0-eo-initial-eurosat-data-exploration.ipynb

Large diffs are not rendered by default.

10 changes: 8 additions & 2 deletions references/ML_pipeline.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,14 @@
- Train images:269695
- Test images:125866
- Validation images:123723
- Balanced irrigation: 6606
- Balanced vineyards: 4758

- Balanced train images:13932
- Balanced test images:6604
- Balanced val images:6606

- Balanced vineyard train images:9790
- Balanced vineyard test images:4500
- Balanced vineyard val images:4758

#### EDA
# Integrating and Exploring the Combined MSI Dataset for California
Expand Down
1 change: 1 addition & 0 deletions src/data/bemodels
19 changes: 19 additions & 0 deletions src/data/count_tfrecords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import tensorflow as tf

# Report the library versions in use, then count the records in each
# balanced TFRecord split and print "<file name> <record count>".
print(pd.__version__)
print(tf.__version__)

# Folder that holds the TFRecord files counted below.
processed_dir = '/workspace/app/data/processed/'

tf_file_path_list = [
    'balanced_train.tfrecord', 'balanced_test.tfrecord', 'balanced_val.tfrecord',
    'balanced_vy_train.tfrecord', 'balanced_vy_test.tfrecord', 'balanced_vy_val.tfrecord',
]

for tfrecord_name in tf_file_path_list:
    tf_records_filename = processed_dir + tfrecord_name
    # tf.data.TFRecordDataset is the public TF 2.x reader (iterable under eager
    # execution, the TF 2.x default). The previous
    # tf.python.python_io.tf_record_iterator went through a private module path
    # of the deprecated TF 1.x iterator.
    record_count = sum(1 for _ in tf.data.TFRecordDataset(tf_records_filename))
    print(tfrecord_name, record_count)

323 changes: 323 additions & 0 deletions src/data/make_data_eurosat.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,323 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#!/usr/bin/env python\n",
"# coding: utf-8"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import argparse\n",
"import csv\n",
"import json\n",
"import numpy as np\n",
"import os\n",
"import pandas as pd\n",
"import rasterio\n",
"import tensorflow as tf\n",
"from glob import glob\n",
"from tqdm import tqdm\n",
"from random import choices"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Path to the extracted EuroSat files (one sub-folder per label)\n",
"eurosat_path = '/workspace/app/data/raw/EuroSat/fulldata/'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# The models folder is already checked in, so there is no need to download the models\n",
"eurosat_folder = '/workspace/app/data/raw/eurosat-models/'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Stores the TFRecords\n",
"out_folder = '/workspace/app/data/processed/EuroSat'"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(eurosat_path):\n",
"    print('folder', eurosat_path, 'does not exist')\n",
"    # No download is performed here: the commands below are disabled reference\n",
"    # notes only, so report that instead of claiming a download is running.\n",
"    print('Automatic download is not enabled; place the EuroSat data in this folder manually.')\n",
"    # os.system(\"curl http://madm.dfki.de/files/sentinel/EuroSAT.zip -o /data/raw/eurosat_rgb.zip\")\n",
"    # os.system(\"unzip /data/raw/eurosat_rgb.zip -d /data/raw\")\n",
"    # os.mkdir(eurosat_path)\n",
"    # TODO: move the extracted files into eurosat_path (the original os.rename note was left unfinished)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Create the models folder when it is missing.\n",
"if not os.path.exists(eurosat_folder):\n",
"    print('ERROR: folder', eurosat_folder, 'does not exist')\n",
"    # makedirs also creates missing parent folders and tolerates the folder\n",
"    # appearing in the meantime; os.mkdir raises in both situations.\n",
"    os.makedirs(eurosat_folder, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Create the TFRecord output folder when it is missing.\n",
"if not os.path.exists(out_folder):\n",
"    print('ERROR: folder', out_folder, 'does not exist')\n",
"    # makedirs also creates missing parent folders and tolerates the folder\n",
"    # appearing in the meantime; os.mkdir raises in both situations.\n",
"    os.makedirs(out_folder, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using Pandas Version: 1.1.5\n",
"Using TensorFlow Version: 2.3.0\n"
]
}
],
"source": [
"# pd.__version__ is the pandas version, so it is labelled as such.\n",
"print(f'Using Pandas Version: {pd.__version__}')\n",
"print(f'Using TensorFlow Version: {tf.__version__}')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Every entry of eurosat_path is treated as one label and mapped to an integer index.\n",
"# os.listdir returns entries in arbitrary order, so sort them to keep the\n",
"# label -> index mapping reproducible across runs and machines.\n",
"label_list = sorted(os.listdir(eurosat_path))\n",
"label_indices = {\n",
"    'original_labels': {lbl: idx for idx, lbl in enumerate(label_list)}\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def prep_example_eurosat(bands, original_labels, original_labels_multi_hot, patch_name):\n",
"    \"\"\"Build a tf.train.Example from the band arrays, labels and patch name.\n",
"\n",
"    Each band in `bands` is flattened with np.ravel and stored as an int64 list;\n",
"    the label strings and the patch name are stored UTF-8 encoded.\n",
"    \"\"\"\n",
"    def _int64(values):\n",
"        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))\n",
"\n",
"    def _utf8(texts):\n",
"        return tf.train.Feature(\n",
"            bytes_list=tf.train.BytesList(value=[t.encode('utf-8') for t in texts]))\n",
"\n",
"    # Bands written to the example, one feature per band.\n",
"    band_names = ('B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08',\n",
"                  'B8A', 'B09', 'B11', 'B12')\n",
"    feature = {name: _int64(np.ravel(bands[name])) for name in band_names}\n",
"    feature['original_labels'] = _utf8(original_labels)\n",
"    feature['original_labels_multi_hot'] = _int64(original_labels_multi_hot)\n",
"    feature['patch_name'] = _utf8([patch_name])\n",
"    return tf.train.Example(features=tf.train.Features(feature=feature))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" Label: Pasture \n",
"\n",
"Folder Pasture exist already\n",
"1996/2000 [============================>.] - ETA: 0s\n",
" Label: Industrial \n",
"\n",
"Folder Industrial exist already\n",
"2499/2500 [============================>.] - ETA: 0s\n",
" Label: PermanentCrop \n",
"\n",
"Folder PermanentCrop exist already\n",
"2496/2500 [============================>.] - ETA: 0s\n",
" Label: AnnualCrop \n",
"\n",
"Folder AnnualCrop exist already\n",
"2998/3000 [============================>.] - ETA: 0s\n",
" Label: Highway \n",
"\n",
"Folder Highway exist already\n",
"2497/2500 [============================>.] - ETA: 0s\n",
" Label: HerbaceousVegetation \n",
"\n",
"Folder HerbaceousVegetation exist already\n",
"2996/3000 [============================>.] - ETA: 0s\n",
" Label: Residential \n",
"\n",
"Folder Residential exist already\n",
"2999/3000 [============================>.] - ETA: 0s\n",
" Label: Forest \n",
"\n",
"Folder Forest exist already\n",
"2997/3000 [============================>.] - ETA: 0s\n",
" Label: River \n",
"\n",
"Folder River exist already\n",
"2498/2500 [============================>.] - ETA: 0s\n",
" Label: SeaLake \n",
"\n",
"Folder SeaLake exist already\n",
"2998/3000 [============================>.] - ETA: 0sCompleted!!!\n"
]
}
],
"source": [
"# Band name assumed for each raster band: rasterio band i+1 <-> bands_l[i].\n",
"# NOTE(review): confirm this order against the EuroSat band layout.\n",
"bands_l = ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08',\n",
"           'B09', 'B10', 'B11', 'B12', 'B8A']\n",
"\n",
"# Split options [Train = 0, Test= 1, Validation= 2]\n",
"split_option = [0, 1, 2]\n",
"# Split Probabilities [Train = 50%, Test= 25%, Validation= 25%]\n",
"weights = [0.5, 0.25, 0.25]\n",
"\n",
"# Make sure the output folder exists before the writers create files in it.\n",
"os.makedirs(out_folder, exist_ok=True)\n",
"\n",
"# TFRecords writers, indexed by split option (0 = train, 1 = test, 2 = val).\n",
"writers = [\n",
"    tf.io.TFRecordWriter(os.path.join(out_folder, file_name))\n",
"    for file_name in ('train.tfrecord', 'test.tfrecord', 'val.tfrecord')\n",
"]\n",
"TFRec_writer_train, TFRec_writer_test, TFRec_writer_val = writers\n",
"\n",
"try:\n",
"    for tifile in label_list:\n",
"        print('\\n Label: {} \\n'.format(tifile))\n",
"\n",
"        # create labels\n",
"        original_labels = [tifile]\n",
"\n",
"        # multi-hot encode the label: a single 1 at the label's index\n",
"        original_labels_multi_hot = np.zeros(len(label_list), dtype=int)\n",
"        original_labels_multi_hot[label_indices['original_labels'][tifile]] = 1\n",
"\n",
"        # every file in the label folder becomes one example\n",
"        files_list = os.listdir(os.path.join(eurosat_path, tifile))\n",
"        progress_bar = tf.keras.utils.Progbar(target=len(files_list))\n",
"        for findex, fex in enumerate(files_list, start=1):\n",
"            # Identify the example by its file name (without extension);\n",
"            # previously every example stored the output folder path here.\n",
"            patch_name = os.path.splitext(fex)[0]\n",
"\n",
"            # B10 is skipped because prep_example_eurosat does not store it.\n",
"            # The with-block closes the raster file once its bands are read.\n",
"            with rasterio.open(os.path.join(eurosat_path, tifile, fex)) as band_ds:\n",
"                bands = {\n",
"                    band: np.array(band_ds.read(band_no))\n",
"                    for band_no, band in enumerate(bands_l, start=1)\n",
"                    if band != 'B10'\n",
"                }\n",
"\n",
"            example = prep_example_eurosat(bands,\n",
"                                           original_labels,\n",
"                                           original_labels_multi_hot,\n",
"                                           patch_name)\n",
"\n",
"            # Randomly assign the example to train / test / val.\n",
"            pick = choices(split_option, weights)[0]\n",
"            writers[pick].write(example.SerializeToString())\n",
"\n",
"            # findex starts at 1 so the bar reaches its target on the last file.\n",
"            progress_bar.update(findex)\n",
"finally:\n",
"    # Always close the writers so buffered records are flushed to disk.\n",
"    for writer in writers:\n",
"        writer.close()\n",
"\n",
"print('Completed!!!')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading