Added nn6
6
.vscode/settings.json
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"workbench.editorAssociations": {
|
||||
"*.copilotmd": "vscode.markdown.preview.editor",
|
||||
"file:/**/*.csv": "jupyter-data-wrangler"
|
||||
}
|
||||
}
|
||||
@@ -1189,7 +1189,7 @@
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "dsai",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@@ -1203,7 +1203,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.20"
|
||||
"version": "3.9.23"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -1524,7 +1524,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.20"
|
||||
"version": "3.9.23"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
126
06_NN/code/nn_10_cnn_2.ipynb
Normal file
@@ -0,0 +1,126 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "086b9495",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<div style=\"\n",
|
||||
" border: 2px solid #4CAF50; \n",
|
||||
" padding: 15px; \n",
|
||||
" background-color: #f4f4f4; \n",
|
||||
" border-radius: 10px; \n",
|
||||
" align-items: center;\">\n",
|
||||
"\n",
|
||||
"<h1 style=\"margin: 0; color: #4CAF50;\">Neural Networks: Convolutional Neural Networks (2)</h1>\n",
|
||||
"<h2 style=\"margin: 5px 0; color: #555;\">DSAI</h2>\n",
|
||||
"<h3 style=\"margin: 5px 0; color: #555;\">Jakob Eggl</h3>\n",
|
||||
"\n",
|
||||
"<div style=\"flex-shrink: 0;\">\n",
|
||||
" <img src=\"https://www.htl-grieskirchen.at/wp/wp-content/uploads/2022/11/logo_bildschirm-1024x503.png\" alt=\"Logo\" style=\"width: 250px; height: auto;\"/>\n",
|
||||
"</div>\n",
|
||||
"<p1> © 2025/26 Jakob Eggl. Nutzung oder Verbreitung nur mit ausdrücklicher Genehmigung des Autors.</p1>\n",
|
||||
"</div>\n",
|
||||
"<div style=\"flex: 1;\">\n",
|
||||
"</div> "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "640a7aba",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Nachdem wir jetzt wissen, wie ein *Convolutional Neuronal Network* (**CNN**) funktioniert, wollen wir nun nochmal die Datasets **MNIST** und **Fashion-MNIST** ausprobieren."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8d8125aa",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Erstelle somit ein Neuronales Netzwerk, welches auf der CNN Architektur basiert und auf den Datasets **MNIST** und **Fashion-MNIST** trainiert wird."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2b5f313f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Lösung"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6c231b6a",
|
||||
"metadata": {},
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "2a0a6fdf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import torch.optim as optim\n",
|
||||
"from torch.utils.data import DataLoader, Dataset, random_split\n",
|
||||
"from torchvision import datasets, transforms\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.metrics import confusion_matrix\n",
|
||||
"import seaborn as sns"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "e942400c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"cpu\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"device = torch.device('cuda') if torch.cuda.is_available() else torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')\n",
|
||||
"print(device)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "86c04c79",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "dsai",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.23"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
938
06_NN/code/nn_10_cnn_2_solution.ipynb
Normal file
3506
06_NN/code/nn_1_numpy_pytorch.ipynb
Normal file
1581
06_NN/code/nn_2_mlp_activation_fct.ipynb
Normal file
1364
06_NN/code/nn_3_optimierung.ipynb
Normal file
1031
06_NN/code/nn_4_datasets_dataloader.ipynb
Normal file
1585
06_NN/code/nn_5_training_tipps_tricks.ipynb
Normal file
1194
06_NN/code/nn_6_deployment.ipynb
Normal file
70
06_NN/code/nn_6_flask_app.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import io
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torchvision.models as models
|
||||
import torchvision.transforms as transforms
|
||||
from PIL import Image
|
||||
from flask import Flask, request, jsonify
|
||||
import os
|
||||
import json
|
||||
import urllib.request
|
||||
|
||||
|
||||
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')
|
||||
print(device)
|
||||
|
||||
model = models.resnet18(weights = models.ResNet18_Weights.IMAGENET1K_V1)
|
||||
model.eval()
|
||||
model.to(device)
|
||||
|
||||
transform = transforms.Compose([
|
||||
transforms.Resize(256),
|
||||
transforms.CenterCrop(224),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize(
|
||||
mean=[0.485, 0.456, 0.406], # ImageNet Normalisierung
|
||||
std=[0.229, 0.224, 0.225]
|
||||
)
|
||||
])
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
@app.route("/predict", methods=["POST"])
|
||||
def predict():
|
||||
"""
|
||||
Erwartet eine Bilddatei ('image') im POST-Request.
|
||||
Beispiel (Postman): POST -> http://127.0.0.1:5665/predict
|
||||
Body -> form-data -> key='image', value=<Bilddatei>
|
||||
"""
|
||||
if "image" not in request.files:
|
||||
return jsonify({"error": "Kein Bild hochgeladen!"}), 400
|
||||
|
||||
file = request.files["image"]
|
||||
img_bytes = file.read()
|
||||
|
||||
try:
|
||||
image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
|
||||
except Exception as e:
|
||||
return jsonify({"error": f"Fehler beim Öffnen des Bildes: {str(e)}"}), 400
|
||||
|
||||
# Preprocessing
|
||||
input_tensor = transform(image).unsqueeze(0).to(device)
|
||||
|
||||
# Inferenz
|
||||
with torch.no_grad():
|
||||
outputs = model(input_tensor)
|
||||
_, predicted = outputs.max(1)
|
||||
|
||||
# Ergebnis zurückgeben
|
||||
return jsonify({
|
||||
"predicted_class_index": predicted.item()
|
||||
})
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run(host="0.0.0.0", port=5665, debug=True)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
867
06_NN/code/nn_8_example_classification_solution.ipynb
Normal file
@@ -0,0 +1,867 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2580d14d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<div style=\"\n",
|
||||
" border: 2px solid #4CAF50; \n",
|
||||
" padding: 15px; \n",
|
||||
" background-color: #f4f4f4; \n",
|
||||
" border-radius: 10px; \n",
|
||||
" align-items: center;\">\n",
|
||||
"\n",
|
||||
"<h1 style=\"margin: 0; color: #4CAF50;\">Neural Networks: Ein Beispiel (Klassifikation) (Lösung)</h1>\n",
|
||||
"<h2 style=\"margin: 5px 0; color: #555;\">DSAI</h2>\n",
|
||||
"<h3 style=\"margin: 5px 0; color: #555;\">Jakob Eggl</h3>\n",
|
||||
"\n",
|
||||
"<div style=\"flex-shrink: 0;\">\n",
|
||||
" <img src=\"https://www.htl-grieskirchen.at/wp/wp-content/uploads/2022/11/logo_bildschirm-1024x503.png\" alt=\"Logo\" style=\"width: 250px; height: auto;\"/>\n",
|
||||
"</div>\n",
|
||||
"<p1> © 2025/26 Jakob Eggl. Nutzung oder Verbreitung nur mit ausdrücklicher Genehmigung des Autors.</p1>\n",
|
||||
"</div>\n",
|
||||
"<div style=\"flex: 1;\">\n",
|
||||
"</div> "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e1a0eaf8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wir wollen nun auch ein neuronales Netzwerk für die Klassifizierung bauen. Dabei wollen wir ein sehr bekanntes Dataset verwenden (MNIST). Es gibt es in vielen Variationen (zum Beispiel auch mit Kleidung (Fashion-MNIST)) und ist gratis. \n",
|
||||
"\n",
|
||||
"Zuerst wollen wir das normale MNIST Dataset verwenden. Es beinhaltet die handgeschriebenen Zahlen von $0$ bis $9$. Ziel ist es die richtige Zahl zu erkennen."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cc8846a2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"(von https://de.wikipedia.org/wiki/MNIST-Datenbank)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "51c9fecc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Insgesamt hat das MNIST Dataset $60\\mathrm k$ Trainingsbilder und $10\\mathrm k$ Testbilder. Die Klassen sind dabei ziemlich gleichverteilt, sprich es gibt in etwa gleich viele Bilder mit Label \"1\", Label \"2\", usw."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e054c9dc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Lösung"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7b7c7bd4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Zu Beginn wollen wir sicherstellen, dass jede und jeder das MNIST Dataset heruntergeladen hat. Der Pfad der folgenden Methode kann, wenn nötig, angepasst werden."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5d2fdca5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import torch.optim as optim\n",
|
||||
"from torch.utils.data import DataLoader, Dataset, random_split\n",
|
||||
"from torchvision import datasets, transforms\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.metrics import confusion_matrix\n",
|
||||
"import seaborn as sns"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "38992064",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_path = os.path.join(\"..\", \"..\", \"_data\", \"mnist_data\")\n",
|
||||
"\n",
|
||||
"train_dataset = datasets.MNIST(root=data_path, train=True, download=True, transform=transforms.ToTensor()) # ToTensor makes images [0, 1] instead of {1,2,...,255}\n",
|
||||
"test_dataset = datasets.MNIST(root=data_path, train=False, download=True, transform=transforms.ToTensor())\n",
|
||||
"\n",
|
||||
"test_size = len(test_dataset) // 2\n",
|
||||
"valid_size = len(test_dataset) - test_size\n",
|
||||
"\n",
|
||||
"test_dataset, valid_dataset = random_split(test_dataset, [test_size, valid_size])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7e4570f5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Mit der obigen Methode haben wir direkt ein Torch Dataset erhalten und müssen nur mehr später den Dataloader erstellen."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2fae606c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Kurze **Wiederholung**: *Wie erstellt man sein eigenes Dataset*?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3504b37b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Zum Beispiel so:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2c5ae338",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class MyDataSetThatIsNeverUsed(Dataset): \n",
|
||||
" def __init__(self, transform=None):\n",
|
||||
" super().__init__()\n",
|
||||
" self.transform = transform\n",
|
||||
"\n",
|
||||
" def __len__(self):\n",
|
||||
" return 0\n",
|
||||
"\n",
|
||||
" def __getitem__(self, idx):\n",
|
||||
" # here is place for the transformation. Returns input and label\n",
|
||||
" return torch.tensor([]), torch.tensor(0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1068fe47",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Ansonsten starten wir wieder mit dem device (Prinzipiell eine gute Gewohnheit, dies einmalig am Anfang zu definieren)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3e5a91e4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"device = torch.device('cuda') if torch.cuda.is_available() else torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')\n",
|
||||
"print(device)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b992eade",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Nachdem wir die Datasets schon haben, wollen wir nun die Dataloader definieren."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "01f002c0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"batch_size = 64\n",
|
||||
"\n",
|
||||
"train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n",
|
||||
"test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)\n",
|
||||
"valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6d262f12",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wir wollen uns nun auch noch ein paar Bilder aus dem Trainingsset ansehen."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "661246a3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"examples = enumerate(train_loader)\n",
|
||||
"batch_idx, (example_data, example_targets) = next(examples)\n",
|
||||
"\n",
|
||||
"plt.figure(figsize=(8, 3))\n",
|
||||
"for i in range(6):\n",
|
||||
" plt.subplot(1, 6, i+1)\n",
|
||||
" plt.tight_layout()\n",
|
||||
" plt.imshow(example_data[i][0], cmap='gray', interpolation='none')\n",
|
||||
" plt.title(f\"{example_targets[i]}\")\n",
|
||||
" plt.xticks([])\n",
|
||||
" plt.yticks([])\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c14a7e3c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Als nächstes definieren wir uns das Netzwerk. Auf was müssen wir nun acht geben im Vergleich zur Regression?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e2d9cd26",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class MNISTClassifier(nn.Module):\n",
|
||||
" def __init__(self):\n",
|
||||
" super().__init__()\n",
|
||||
" self.layers = nn.Sequential(\n",
|
||||
" nn.Flatten(), # Very important! Why? -> We will see that for CNN's we don't need this flattening!\n",
|
||||
" nn.Linear(28*28, 256),\n",
|
||||
" nn.ReLU(),\n",
|
||||
" nn.Linear(256, 128),\n",
|
||||
" nn.ReLU(),\n",
|
||||
" nn.Linear(128,64),\n",
|
||||
" nn.ReLU(),\n",
|
||||
" nn.Linear(64, 10),\n",
|
||||
" )\n",
|
||||
" def forward(self, x):\n",
|
||||
" return self.layers(x)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e413ca3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = MNISTClassifier().to(device)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cf9deca1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Welchen Loss wollen wir verwenden? Welchen Optimizer?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fe340898",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"lr = 0.001\n",
|
||||
"\n",
|
||||
"criterion = nn.CrossEntropyLoss()\n",
|
||||
"optimizer = optim.Adam(model.parameters(), lr=lr)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2b93a21c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Kommen wir nun zur Trainingsmethode. Wir machen diese dieses Mal als eigene Methode. Ebenso machen wir das mit der Evaluierungsmethode. (Grund für die umgekehrte Reihenfolge ist, weil die Trainingsmethode eine Evaluierungsmethode beinhaltet.)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4da656f8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def evaluate_model(model, data_loader, criterion):\n",
|
||||
" model.eval()\n",
|
||||
" loss_total = 0.0\n",
|
||||
" correct = 0\n",
|
||||
" total = 0\n",
|
||||
" \n",
|
||||
" with torch.no_grad():\n",
|
||||
" for data, target in data_loader:\n",
|
||||
" data, target = data.to(device), target.to(device)\n",
|
||||
" output = model(data)\n",
|
||||
" loss = criterion(output, target)\n",
|
||||
" loss_total += loss.item() * data.size(0)\n",
|
||||
" \n",
|
||||
" _, predicted = torch.max(output.data, 1)\n",
|
||||
" total += target.size(0)\n",
|
||||
" correct += (predicted == target).sum().item()\n",
|
||||
" \n",
|
||||
" avg_loss = loss_total / total\n",
|
||||
" accuracy = 100.0 * correct / total\n",
|
||||
" return avg_loss, accuracy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "be0815fe",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def train_model(model, train_loader, valid_loader, criterion, optimizer, save_path:str=None,\n",
|
||||
" epochs=20, validate_at=1, print_at=100, patience=3):\n",
|
||||
" \n",
|
||||
" if save_path is None:\n",
|
||||
" save_path = os.path.join(\"..\", \"models\", \"nn_8_best_model.pth\")\n",
|
||||
"\n",
|
||||
" best_loss = float(\"inf\")\n",
|
||||
" patience_counter = 0\n",
|
||||
"\n",
|
||||
" for epoch in range(1, epochs+1):\n",
|
||||
" model.train()\n",
|
||||
" running_loss = 0.0\n",
|
||||
"\n",
|
||||
" for batch_idx, (data, target) in enumerate(train_loader):\n",
|
||||
" data, target = data.to(device), target.to(device)\n",
|
||||
"\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" output = model(data)\n",
|
||||
" loss = criterion(output, target)\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" \n",
|
||||
" running_loss += loss.item()\n",
|
||||
" \n",
|
||||
" if (batch_idx+1) % print_at == 0:\n",
|
||||
" print(f\"Epoch [{epoch}/{epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}\")\n",
|
||||
"\n",
|
||||
" if epoch % validate_at == 0:\n",
|
||||
" val_loss, val_acc = evaluate_model(model, valid_loader, criterion)\n",
|
||||
" print(f\"Epoch [{epoch}/{epochs}] - Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.2f}%\")\n",
|
||||
"\n",
|
||||
" if val_loss < best_loss:\n",
|
||||
" best_loss = val_loss\n",
|
||||
" patience_counter = 0\n",
|
||||
" torch.save(model.state_dict(), save_path)\n",
|
||||
" print(f\">>> Found a better model and saved it at '{save_path}'\")\n",
|
||||
" else:\n",
|
||||
" patience_counter += 1\n",
|
||||
" print(f\"No Improvement. Early Stopping Counter: {patience_counter}/{patience}\")\n",
|
||||
" if patience_counter >= patience:\n",
|
||||
" print(\"Early Stopping triggered.\")\n",
|
||||
" break"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3afc2ae3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Last but not least wollen wir nun das Modell trainieren. Dazu definieren wir uns die Hyperparameter zuerst (manche sind der Form halber jetzt doppelt)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "32230cc3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"### HYPERPARAMETER ###\n",
|
||||
"\n",
|
||||
"model = MNISTClassifier().to(device)\n",
|
||||
"criterion = nn.CrossEntropyLoss()\n",
|
||||
"lr = 0.001\n",
|
||||
"optimizer = optim.Adam(model.parameters(), lr=lr)\n",
|
||||
"epochs = 20\n",
|
||||
"validate_at = 1\n",
|
||||
"print_at = 200\n",
|
||||
"early_stopping_patience = 3\n",
|
||||
"save_path = os.path.join(\"..\", \"models\", \"nn_8_best_model_mnist.pth\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "453d2b87",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_model(model, train_loader, valid_loader, criterion, optimizer, epochs=epochs, validate_at=validate_at, print_at=print_at, patience=early_stopping_patience, save_path=save_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fd829890",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Am Schluss evaluieren wir noch das beste Modell:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d288c09d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.load_state_dict(torch.load(save_path))\n",
|
||||
"test_loss, test_acc = evaluate_model(model, test_loader, criterion)\n",
|
||||
"print(f\"Finaler Test Loss: {test_loss:.4f}\")\n",
|
||||
"print(f\"Finale Test Accuracy: {test_acc:.2f}%\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "60aa40ec",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"___"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "35559661",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Sind wir zufrieden? Was könnte man verbessern?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e2a63701",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Man könnte eine (andere) Transformation verwenden."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cc538713",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Berechnen wir dazu mal den Mean und die Varianz (bzw. Standardabweichung der Trainingsdaten)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "40ad9243",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mean = 0.\n",
|
||||
"std = 0.\n",
|
||||
"for imgs, _ in train_loader:\n",
|
||||
" mean += imgs.mean()\n",
|
||||
" std += imgs.std()\n",
|
||||
"\n",
|
||||
"mean /= len(train_loader)\n",
|
||||
"std /= len(train_loader)\n",
|
||||
"print(mean.item(), std.item())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f1cf0477",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_transform = transforms.Compose([\n",
|
||||
" transforms.RandomRotation(10), # small data augmentation\n",
|
||||
" transforms.ToTensor(),\n",
|
||||
" transforms.Normalize((0.1307,), (0.3081,))\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"test_transform = transforms.Compose([\n",
|
||||
" transforms.ToTensor(),\n",
|
||||
" transforms.Normalize((0.1307,), (0.3081,))\n",
|
||||
"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e7d0e06f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_path = os.path.join(\"..\", \"..\", \"_data\", \"mnist_data\")\n",
|
||||
"\n",
|
||||
"train_dataset = datasets.MNIST(root=data_path, train=True, download=True, transform=train_transform) # ToTensor makes images [0, 1] instead of {1,2,...,255}\n",
|
||||
"test_dataset = datasets.MNIST(root=data_path, train=False, download=True, transform=test_transform)\n",
|
||||
"\n",
|
||||
"test_size = len(test_dataset) // 2\n",
|
||||
"valid_size = len(test_dataset) - test_size\n",
|
||||
"\n",
|
||||
"test_dataset, valid_dataset = random_split(test_dataset, [test_size, valid_size])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2f8f37e5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"batch_size = 64\n",
|
||||
"\n",
|
||||
"train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n",
|
||||
"test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)\n",
|
||||
"valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cf9c9c5c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"### HYPERPARAMETER ###\n",
|
||||
"\n",
|
||||
"model = MNISTClassifier().to(device)\n",
|
||||
"criterion = nn.CrossEntropyLoss()\n",
|
||||
"lr = 0.001\n",
|
||||
"optimizer = optim.Adam(model.parameters(), lr=lr)\n",
|
||||
"epochs = 10\n",
|
||||
"validate_at = 1\n",
|
||||
"print_at = 200\n",
|
||||
"early_stopping_patience = 3\n",
|
||||
"save_path = os.path.join(\"..\", \"models\", \"nn_8_best_model_mnist_transform.pth\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eab17123",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_model(model, train_loader, valid_loader, criterion, optimizer, epochs=epochs, validate_at=validate_at, print_at=print_at, patience=early_stopping_patience, save_path=save_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "247a1257",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.load_state_dict(torch.load(save_path))\n",
|
||||
"test_loss, test_acc = evaluate_model(model, test_loader, criterion)\n",
|
||||
"print(f\"Finaler Test Loss: {test_loss:.4f}\")\n",
|
||||
"print(f\"Finale Test Accuracy: {test_acc:.2f}%\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7e0cfe7b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Man könnte nun natürlich auch noch weitere Epochen, ein noch größeres Netzwerk, andere Learning Rate, anderer Optimierer etc. verwenden. Wir sehen aber davon ab."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2c35fdc1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"___"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ddb2a810",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wir verwenden jetzt das **Fashion-MNIST** Dataset und führen alles nochmal aus."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6b8e57aa",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Es besteht nun aus Kleidungsstücken und dazugehörig 10 Labels. Wir müssen also unser Modell in erster Linie nicht anpassen. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "06c551b9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Auch hier gibt es $60\\, \\mathrm k$ Trainingsbilder und $10\\, \\mathrm k$ Testbilder."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f80dc124",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wir kopieren nun die wichtigsten Dinge und ändern sie leicht ab."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "06ed22e0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_path = os.path.join(\"..\", \"..\", \"_data\", \"fashion_mnist_data\")\n",
|
||||
"\n",
|
||||
"train_dataset = datasets.FashionMNIST(root=data_path, train=True, download=True, transform=transforms.ToTensor()) # ToTensor makes images [0, 1] instead of {1,2,...,255}\n",
|
||||
"test_dataset = datasets.FashionMNIST(root=data_path, train=False, download=True, transform=transforms.ToTensor())\n",
|
||||
"\n",
|
||||
"test_size = len(test_dataset) // 2\n",
|
||||
"valid_size = len(test_dataset) - test_size\n",
|
||||
"\n",
|
||||
"test_dataset, valid_dataset = random_split(test_dataset, [test_size, valid_size])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "11e3c83a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"batch_size = 64\n",
|
||||
"\n",
|
||||
"train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n",
|
||||
"test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)\n",
|
||||
"valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dac37e73",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"examples = enumerate(train_loader)\n",
|
||||
"batch_idx, (example_data, example_targets) = next(examples)\n",
|
||||
"\n",
|
||||
"label_dict = {\n",
|
||||
" 0: \"T-Shirt\",\n",
|
||||
" 1: \"Trouser\",\n",
|
||||
" 2: \"Pullover\",\n",
|
||||
" 3: \"Dress\",\n",
|
||||
" 4: \"Coat\",\n",
|
||||
" 5: \"Sandal\",\n",
|
||||
" 6: \"Shirt\",\n",
|
||||
" 7: \"Sneaker\",\n",
|
||||
" 8: \"Bag\",\n",
|
||||
" 9: \"Ankle Boot\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"plt.figure(figsize=(8, 3))\n",
|
||||
"for i in range(6):\n",
|
||||
" plt.subplot(1, 6, i+1)\n",
|
||||
" plt.tight_layout()\n",
|
||||
" plt.imshow(example_data[i][0], cmap='gray', interpolation='none')\n",
|
||||
" plt.title(f\"{label_dict[example_targets[i].item()]}\")\n",
|
||||
" plt.xticks([])\n",
|
||||
" plt.yticks([])\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7950b546",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wir verwenden nun das gleiche Modell wie vorher, ändern aber den Klassennamen."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "25451f9e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class FashionMNISTClassifier(nn.Module):\n",
|
||||
" def __init__(self):\n",
|
||||
" super().__init__()\n",
|
||||
" self.layers = nn.Sequential(\n",
|
||||
" nn.Flatten(), # Very important! Why?\n",
|
||||
" nn.Linear(28*28, 256),\n",
|
||||
" nn.ReLU(),\n",
|
||||
" nn.Linear(256, 128),\n",
|
||||
" nn.ReLU(),\n",
|
||||
" nn.Linear(128,64),\n",
|
||||
" nn.ReLU(),\n",
|
||||
" nn.Linear(64, 10),\n",
|
||||
" )\n",
|
||||
" def forward(self, x):\n",
|
||||
" return self.layers(x)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e60d3519",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Nun trainieren wir das Modell."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "12b4a9d7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"### HYPERPARAMETER ###\n",
|
||||
"\n",
|
||||
"model = FashionMNISTClassifier().to(device)\n",
|
||||
"criterion = nn.CrossEntropyLoss()\n",
|
||||
"lr = 0.001\n",
|
||||
"optimizer = optim.Adam(model.parameters(), lr=lr)\n",
|
||||
"epochs = 20\n",
|
||||
"validate_at = 1\n",
|
||||
"print_at = 200\n",
|
||||
"early_stopping_patience = 3\n",
|
||||
"save_path = os.path.join(\"..\", \"models\", \"nn_8_best_model_fashion_mnist.pth\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eecd887f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_model(model, train_loader, valid_loader, criterion, optimizer, epochs=epochs, validate_at=validate_at, print_at=print_at, patience=early_stopping_patience, save_path=save_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b63239a2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.load_state_dict(torch.load(save_path))\n",
|
||||
"test_loss, test_acc = evaluate_model(model, test_loader, criterion)\n",
|
||||
"print(f\"Finaler Test Loss: {test_loss:.4f}\")\n",
|
||||
"print(f\"Finale Test Accuracy: {test_acc:.2f}%\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cfb071d1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Diese Performance ist nicht wirklich gut. Für 10 Klassen bedeutet das, dass wir im Mittel 1 von 10 Klassen falsch zuordnen. Wir betrachten noch kurz die Confusion-Matrix."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f4be3ab7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Confusion Matrix of FashionMNIST model\n",
|
||||
"\n",
|
||||
"test_data = test_loader.dataset.dataset.data[test_loader.dataset.indices]\n",
|
||||
"test_targets = test_loader.dataset.dataset.targets[test_loader.dataset.indices]\n",
|
||||
"\n",
|
||||
"pred = model(test_data.unsqueeze(1).float().to(device))\n",
|
||||
"cm = confusion_matrix(test_targets.cpu(), pred.argmax(dim=1).cpu())\n",
|
||||
"\n",
|
||||
"plt.figure(figsize=(8, 6))\n",
|
||||
"sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_dict.values(), yticklabels=label_dict.values())\n",
|
||||
"plt.title(\"Confusion Matrix for FashionMNIST Classification\")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2f6df1f8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"(zur Erinnerung, $y$-Achse entspricht der Ground-Truth und $x$-Achse entspricht der Vorhersage)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c792fa9a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ef946e2e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Reicht also immer ein Fully-Connected Neuronal Netzwerk aus?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f438ffcc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Nein!**\n",
|
||||
"\n",
|
||||
"Es gibt viele Probleme, die andere Architekturen erwarten. Auch, wenn man in gewissen Situationen vielleicht mit so einer Performance zufrieden ist, werden wir, insbesondere, wenn wir uns später zum Beispiel der **Image-Inpainting** Challenge widmen, sehen, dass wir andere Architekturen brauchen, da diese viel besser funktionieren."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "43a29933",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Als nächstes werden wir also eine neue Architektur kennenlernen, welche mit Bildern noch viel besser umgehen kann, als Feed-Forward Neuronal Netzwerke. Die Rede ist von sogenannten ***CNN's*** (*Convolutional Neuronal Networks*)."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "dsai",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
822
06_NN/code/nn_9_cnn_1.ipynb
Normal file
@@ -0,0 +1,822 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a83eef2d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<div style=\"\n",
|
||||
" border: 2px solid #4CAF50; \n",
|
||||
" padding: 15px; \n",
|
||||
" background-color: #f4f4f4; \n",
|
||||
" border-radius: 10px; \n",
|
||||
" align-items: center;\">\n",
|
||||
"\n",
|
||||
"<h1 style=\"margin: 0; color: #4CAF50;\">Neural Networks: Convolutional Neural Networks (1)</h1>\n",
|
||||
"<h2 style=\"margin: 5px 0; color: #555;\">DSAI</h2>\n",
|
||||
"<h3 style=\"margin: 5px 0; color: #555;\">Jakob Eggl</h3>\n",
|
||||
"\n",
|
||||
"<div style=\"flex-shrink: 0;\">\n",
|
||||
" <img src=\"https://www.htl-grieskirchen.at/wp/wp-content/uploads/2022/11/logo_bildschirm-1024x503.png\" alt=\"Logo\" style=\"width: 250px; height: auto;\"/>\n",
|
||||
"</div>\n",
|
||||
"<p1> © 2025/26 Jakob Eggl. Nutzung oder Verbreitung nur mit ausdrücklicher Genehmigung des Autors.</p1>\n",
|
||||
"</div>\n",
|
||||
"<div style=\"flex: 1;\">\n",
|
||||
"</div> "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "83a52169",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Nachdem wir bisher nur von `Linear`-Layers gesprochen haben, möchten wir uns jetzt auf eine **neue Architektur** fokussieren. Diese ist besonders geeignet für Bilder und hat eine gravierend andere Funktionsweise als bisherige Layers.\n",
|
||||
"\n",
|
||||
"Die Rede ist von sogenannten **Convolution**-Layers, welche dann die sogenannten Convolutional Neural Networks (CNNs) bilden.\n",
|
||||
"\n",
|
||||
"In diesem Notebook wollen wir:\n",
|
||||
"1) Zuerst die Convolution-Operation kennenlernen und deren Anwendung in der klassischen Bildbearbeitung betrachten bevor wir\n",
|
||||
"2) Uns das Convolution-Layer in PyTorch ansehen und uns am Schluss\n",
|
||||
"3) Mit den weiteren damit verbundenen Layers wie Pooling befassen."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f13607fc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Hinweis:** *Convolution* ist der englische Begriff für *Faltung*."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8dd66ee2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Hinweis:** Die Faltung ist eine mathematische Operation, welche [hier](https://de.wikipedia.org/wiki/Faltung_(Mathematik)) nachgelesen werden *kann*."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cef6ffb9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Die Convolution-Operation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "27ba7dd3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Zu Beginn wollen wir uns die Faltungsoperation auf Bildern ansehen.\n",
|
||||
"\n",
|
||||
"In diesem Notebook haben wir dabei immer folgendes Setup:\n",
|
||||
"* Wir haben ein Bild, sprich mathematisch gesehen einen $(3, \\text{Hoehe}, \\text{Breite})$ Tensor, also dreimal eine Matrix der Größe $\\text{Hoehe} \\times \\text{Breite}$.\n",
|
||||
"* Wir haben einen Filter $K$ (auch Kernel genannt), mathematisch gesehen ist das auch ein Tensor, der Einfachheit halber (und das trifft auch meistens zu) denken wir einfach an eine Matrix.\n",
|
||||
"* Wir wollen den Filter auf unser Bild anwenden und dabei ein neues Bild, also eine neue Matrix generieren."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e14cbfd5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Betrachten wir hier zuerst mal ein paar Beispiele."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8c32d03d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Mario_Convolution.png\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von https://www.youtube.com/watch?v=KuXjwB4LzSA)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f1c431d9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Kirby_Convolution.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von https://www.youtube.com/watch?v=KuXjwB4LzSA)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8ba45f6c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Hier ist bei beiden Bildern rechts oben bzw. auch links das große Bild das Ausgangsbild. Rechts oben (blau umrahmt) ist der Filter zu sehen.\n",
|
||||
"\n",
|
||||
"Das Ergebnis der Faltungsoperation (Convolution) ist unten rechts zu sehen:\n",
|
||||
"Im Super-Mario Bild ist das ein Unschärfe Filter, im zweiten Bild mehr oder weniger das Gegenteil, sprich ein Filter zum Schärfen des Bildes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1d2ec712",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Aber was passiert da genau?**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8fcde416",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wir betrachten dazu ein **schwarz-weiß Bild**, also eine Matrix als unseren Input.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"In unserem Fall ist das die **Matrix**:\n",
|
||||
"$$X=\\begin{pmatrix}\n",
|
||||
" 0 & 0 & 0 & 0 & 0 & 0\\\\\n",
|
||||
" 0 & 1.0 & 0 & 0 & 0.4 & 0\\\\\n",
|
||||
" 0 & 0 & 0 & 0.4 & 0 & 0\\\\\n",
|
||||
" 0 & 0 & 0 & 0 & 0.4 & 0\\\\\n",
|
||||
" 0 & 0 & 0 & 0.4 & 0 & 0\\\\\n",
|
||||
" 0 & 0 & 0 & 0 & 0.4 & 0\n",
|
||||
"\\end{pmatrix}$$\n",
|
||||
"\n",
|
||||
"Dazu wollen wir als **Kernel (=Filter)** folgende Matrix verwenden\n",
|
||||
"\n",
|
||||
"$$K = \\begin{pmatrix}\n",
|
||||
" 0 & 0 & 0.5 \\\\\n",
|
||||
" 0 & 0.5 & 0 \\\\\n",
|
||||
" 0 & 0 & 0.5\n",
|
||||
"\\end{pmatrix}$$"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "688d77f4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Bei der Faltung wird nun der Filter über den Input $X$ gelegt, und das in jeder möglichen Kombination.\n",
|
||||
"\n",
|
||||
"Das sieht in unserem Fall dann so aus für den ersten Eintrag des Ergebnisses."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "945e1193",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Convolution_Concept1.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "54827b86",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Was ist hier passiert?** Wir haben den Filter (rot) über unser Bild (blau) gelegt und danach einfach elementweise multipliziert und das Ergebnis zusammengezählt.\n",
|
||||
"\n",
|
||||
"Dies wiederholen wir jetzt auch für die anderen Möglichkeiten. Also wir shiften unseren Kern um 1 Spalte nach rechts."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e9e68bb1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Convolution_Concept2.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2551d80d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Dies wiederholen wir jetzt solange, bis wir alle Möglichkeiten durch haben."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bdf22f6b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Convolution_Concept3.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "03f12803",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Convolution_Concept4.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "029b40ce",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Convolution_Concept5.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7c0443f6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Convolution_Concept6.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "baee7070",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Convolution_Concept7.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d2f31397",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Wichtig:** Wie wir sehen ist also die Faltung quasi (mathematisch sehr ungenau) wie ein 2D-Skalarprodukt."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3e3fa120",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Somit können wir auch erklären, warum beim Super-Mario Bild vorher ein unscharfer Output erzeugt wird. Der Grund ist, weil einfach der neue Pixel-Wert der Durchschnitt von allen umliegenden Pixelwerten ist."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1eaf4b05",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Wichtig:** Beim Super-Mario und Kirby Bild ist die Faltung jeweils auf jeden Channel ausgeführt worden. Dabei wurde jedes mal der gleiche Filter (Kernel) verwendet."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cc828e9e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wir wollen nun ein paar weitere Filter ausprobieren bei eigenen Bildern. Dazu nutzen wir die `cv2` Bibliothek. Diese kann (falls noch nicht vorhanden) mit dem `pip install opencv-python` Command installiert werden."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cc849a62",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import cv2\n",
|
||||
"import matplotlib\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import numpy as np\n",
|
||||
"import os\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7b743c95",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"image_path = os.path.join(\"..\", \"resources\", \"Chemnitz_Hauptplatz.jpg\")\n",
|
||||
"image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)\n",
|
||||
"\n",
|
||||
"fig, ax = plt.subplots(figsize=(3, 6))\n",
|
||||
"ax.imshow(image, cmap=\"gray\", vmin=0, vmax=255)\n",
|
||||
"\n",
|
||||
"ax.grid(False)\n",
|
||||
"ax.set_xticks([])\n",
|
||||
"ax.set_yticks([])\n",
|
||||
"\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "39e9c012",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Nun definieren wir unsere Kernel. Beispiele sind zum Beispiel die Sobel Filter."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "47df8c04",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sobel_x = np.array([\n",
|
||||
" [ -1, 0, 1],\n",
|
||||
" [ -2, 0, 2],\n",
|
||||
" [ -1, 0, 1]\n",
|
||||
"])\n",
|
||||
"sobel_y = np.array([\n",
|
||||
" [ -1, -2, -1],\n",
|
||||
" [ 0, 0, 0],\n",
|
||||
" [ 1, 2, 1]\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"mean_blur = 1/400 * np.ones((20, 20))\n",
|
||||
"\n",
|
||||
"edge_detection = sobel_x + sobel_y\n",
|
||||
"\n",
|
||||
"gaussian_blur = 1/36 * np.array([\n",
|
||||
" [1,4,1],\n",
|
||||
" [4,16,4],\n",
|
||||
" [1,4,1]\n",
|
||||
"]) \n",
|
||||
"\n",
|
||||
"filter = edge_detection\n",
|
||||
"\n",
|
||||
"filtered_image = cv2.filter2D(image, -1, filter)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"fig, ax = plt.subplots(figsize=(3, 6))\n",
|
||||
"ax.imshow(filtered_image, cmap=\"gray\", vmin=0, vmax=255)\n",
|
||||
"ax.grid(False)\n",
|
||||
"ax.set_xticks([])\n",
|
||||
"ax.set_yticks([])\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8b70388a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **Übung:** Suche im Internet nach dem **Edge-Detection**, **Mean-Blur**, **Gaussian-Blur** und einem Schärfefilter und implementiere diese oben. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "edc12eaa",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Hinweis:** Der Filter hat normalerweise nur positive Einträge (falls negativ muss man sich überlegen, wie das interpretiert werden soll) und diese Einträge sollen in Summe 1 Ergeben, sodass die (Pixel)Werte des Outputs im gleichen Bereich bleiben."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4dac2efe",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **Übung:** Was ändert sich beim Output neben den eigentlichen Pixel Werten noch? *Tipp:* Betrachte nochmal die genaue Berechnung im vorigen Beispiel, welches in vielen Schritten detailliert gezeigt wurde."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4ce7aa1e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wie wir bereits bemerkt haben, ist die Output Matrix etwas kleiner als der Input. Die genaue Größe kann folgendermaßen berechnet werden:\n",
|
||||
"\n",
|
||||
"$$\\begin{align*}\n",
|
||||
" X_{\\text{new}} &= \\left\\lfloor \\frac{X-K_x+2P_x}{S_x} + 1 \\right\\rfloor \\\\\n",
|
||||
" Y_{\\text{new}} &= \\left\\lfloor \\frac{Y-K_y+2P_y}{S_y} + 1 \\right\\rfloor\n",
|
||||
"\\end{align*}$$"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "54cb3aba",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Dabei ist:\n",
|
||||
"\n",
|
||||
"* $X_{\\text{new}}, Y_{\\text{new}}$ die Größe der neuen Matrix\n",
|
||||
"* $K_x$, $K_y$ die Größe des Kernels ($K_x$ ist die Anzahl der Spalten)\n",
|
||||
"* $P_x, P_y$ steht für das Padding. Dieser Parameter erlaubt uns, einen gleich großen Output wie vorher der Input zu haben. Er beschreibt wie viele Pixel wir rund um unsere Matrix noch hinzufügen. Sprich $P_x=1$ heißt, wir fügen an der linken und an der rechten Seite noch eine Spalte hinzu. Als Wert wird dabei normalerweise die $0$ verwendet (\"**Zero-Padding**\"), es gibt aber auch andere Möglichkeiten.\n",
|
||||
"* $S_x$, $S_y$ steht für den Stride. Dieser steht für die Anzahl an Pixel, die wir jedes mal nach $x$ bzw. nach $y$ \"rutschen\". Standard ist $1$, also wir bewegen uns immer nur 1 Pixel."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9d043f88",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Sehr empfehlenswert ist hier dieses [GitHub Repository](https://github.com/vdumoulin/conv_arithmetic), da es sehr viele Visualisierungen beinhaltet zu den einzelnen Convolution Typen."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "05233ef0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Visualisierung Stride=3**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8c53d073",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Convolution_Stride3_1.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ed3967e3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Convolution_Stride3_2.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "90b7012e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Convolution_Stride3_3.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1ff2b1c2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Convolution_Stride3_4.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e9872f85",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Convolution_Stride3_5.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4b24fa63",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Visualisierung Padding=1**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0f9ad91a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Convolution_Zero_Padding_1.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cc9bedc9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Convolution_Zero_Padding_2.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "80d1cf5b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Der CNN-Layer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4b6f59df",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Nachdem wir nun Bescheid wissen, wie die Faltung auf Bildern allgemein funktioniert, betrachten wir nun das CNN-Layer in PyTorch."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a623b878",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Prinzipiell ist es immer gut, einen Blick in die Dokumentation zu werfen. Deswegen werden wir zuerst [hier](https://docs.pytorch.org/docs/stable/generated/torch.nn.Conv2d.html) kurz schauen. Wir betrachten dabei das `nn.Conv2d()`-Layer, da wir es hauptsächlich auf Bildern (2D) anwenden werden."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5b0d46c4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"`class torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, ..., padding_mode='zeros')`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9a6ab52b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Dies ist der (wichtige Teil vom) Konstruktor des `Conv2d` Layers. Wir gehen nun die Parameter durch:\n",
|
||||
"\n",
|
||||
"* `in_channels`: Die Anzahl der Input Channels. Für ein RGB Bild ist dies 3.\n",
|
||||
"* `out_channels`: Anzahl der Kernels, die wir durch das Netzwerk schicken. Dies ist dann die Anzahl der **Output Channels** und somit auch gleich `in_channels`, falls danach eine weitere `nn.Conv2d` Schicht kommt.\n",
|
||||
"* `kernel_size`: Größe des Kernels (in Tupelform, also $(3,5)$ oder in Integerform, also $3$ für einen quadratischen Kernel (in diesem Fall $3\\times 3$))\n",
|
||||
"* `stride`: Größe des Strides (erneut entweder Tupelform oder Integer bei gleichem Stride)\n",
|
||||
"* `padding`: Größe des Paddings (Tupel oder Integer)\n",
|
||||
"* `padding_mode`: Bestimmt die Art des Paddings. Übergeben wird ein String: 'zeros', 'reflect', 'replicate' oder 'circular'. Standard ist 'zeros'. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "429f1226",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **Übung:** Was sind nun die Parameter die gelernt werden sollen vom Netzwerk?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d15c7721",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wir wollen also die Werte des Filters (Kernel) lernen."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4d73d1e2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **Übung:** Hat ein CNN mehr oder weniger Parameter als ein Fully-Connected Neural Network?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1c80ff0d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wir visualisieren nochmal kurz die Parameter des Netzwerkes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9d23fb13",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/CNN_Input_1.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ab74f54c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/CNN_Input_2.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4563f40a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wir sehen also, dass zum Beispiel für einen RGB Input der Kernel dann auch einfach 3 Channels hat (3 **verschiedene** Matrizen). Dabei wird jeder Layer des Kernels auf das entsprechende Layer im Bild angewendet. Am Ende werden alle Schichten zu **einem Channel** zusammen **addiert**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "05ecc94a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Beim zweiten Bild ist nun die Anzahl an `out_channels` auf 2 gesetzt. Das heißt wir haben 2 **verschiedene** Kernels mit je 3 Channels. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "46485235",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Was ist nun das Ziel von den Kernels?**\n",
|
||||
"\n",
|
||||
"Die Kernels versuchen im Lernprozess Werte in die einzelnen Einträge zu schreiben, sodass wir aus unseren Trainingsbildern Schritt für Schritt möglichst gute Informationen extrahieren können."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ca3e12ed",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Prinzipiell haben wir dadurch schon verstanden wie ein CNN-Layer in PyTorch funktioniert. Was gibt es jetzt noch zu beachten?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e836a6a5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"* Wir müssen nach jedem CNN-Layer berechnen, wie groß unser Ergebnis-Bild ist. Der Grund ist, dass wir nach unseren CNN-Layers eine fixe Output Größe brauchen (siehe nächster Punkt).\n",
|
||||
"* Der Output von einem Neuronalen Netzwerk, welches CNN-Schichten beinhaltet, ist vielfältig:\n",
|
||||
" * In den meisten Fällen wechseln wir nach einigen CNN-Layern zu einem Fully-Connected Neural Network, welches später die Klassifikation bzw. Regression basierend auf den Ergebnissen der Faltung(en) erledigt. Dabei müssen wir wissen, wie groß der Output nach dem letzten CNN-Layer ist. Ist dieser dann zum Beispiel ein $3\\times 3$-Bild, dann würden wir dieses `flatten()` und erhalten einen $9$-Vektor.\n",
|
||||
" * Es gibt aber auch Fälle (zum Beispiel bei unserer Image Inpainting Challenge), bei der wir als Output wirklich Bilder wollen, sprich wir beenden unser Netzwerk auch mit einem CNN-Layer. Auch hier müssen wir sicherstellen, dass dieser Output dann zum Beispiel genauso groß ist wie der Input."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3a7a1438",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **Übung:** Wie können wir das mit CNN's realisieren, dass am Ende unser Bild genauso groß ist wie vorher?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9ceb3b52",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Am Schluss wollen wir uns noch einem weiteren, sehr ähnlichen, Konzept widmen. Die Rede ist von den sogenannten **Pooling-Layers**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a82b2e59",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Pooling"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "369e8557",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Die Idee vom **Pooling** ist, dass wir unser Bild *downsamplen*, sprich ein kleineres Bild produzieren. Dabei gibt es mehrere Möglichkeiten, wie wir dieses Downsampling realisieren können:\n",
|
||||
"* **Average-Pooling:** Wir nehmen den Mittelwert von $k\\times k$ Werten\n",
|
||||
"* **Max Pooling:** Wir nehmen den Maximalwert von $k\\times k$ Werten (wird meistens verwendet)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "41d13d86",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wir visualisieren kurz den Effekt von Max-Pooling, in diesem Fall mit $k=2$. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5278cd51",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Max_Pooling_1.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4a898651",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Max_Pooling_2.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f460b87e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Max_Pooling_3.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fda20b59",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Max_Pooling_4.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a74487e7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"../resources/Max_Pooling_5.jpg\" width=\"1000\"/>\n",
|
||||
"\n",
|
||||
"(von Dr. Andreas Schörgenhumer; Hands-On AI1 WS2021)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0fef4fb6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Wichtig:** Ein Pooling Layer beinhaltet also keine Parameter, er reduziert nur unsere Datengröße deutlich und führt somit auch natürlich zu einem (großen) Informationsverlust."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "583d3484",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In PyTorch können wir so ein Verhalten auch ganz einfach mit einem Pooling Layer erreichen, welches wir mit `nn.MaxPool2d()` ganz einfach hinzufügen können. Die Dokumentation finden wir [hier](https://docs.pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6fd908f0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Betrachten wir kurz (den wichtigsten Teil davon) den Konstruktor.\n",
|
||||
"\n",
|
||||
"`class torch.nn.MaxPool2d(kernel_size, stride=None, padding=0, ...)`\n",
|
||||
"\n",
|
||||
"Der einzige Wert im Konstruktor, der übergeben werden muss, ist `kernel_size`, erneut als Tupel oder Integer. Als `stride` ist standardmäßig `None` übergeben, was bedeutet, dass wir pro Schritt den \"Filter\" um die `kernel_size` verschieben. Ebenso ist standardmäßig kein Padding vorgesehen."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "04818d04",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Hinweis:** Die Größe der Daten kann nach einem Pooling Layer mit der gleichen Formel wie vorher berechnet werden. In vielen Fällen (`stride=kernel_size` und ohne Padding) ist das (bis auf das Verhalten am Rand) die Berechnung natürlich ohne Formel auch sehr leicht."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f97b5dbe",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "480c0c1d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Damit haben wir die Grundlagen eines CNN-Layers verstanden. Wir werden im nächsten Notebook ein großes Beispiel machen, welches uns die Details und die Funktionen in einem praktischen Setting nochmal näher bringt."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "dsai",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
BIN
06_NN/models/best_model_cnn_fashion_mnist.pt
Normal file
BIN
06_NN/models/best_model_cnn_mnist.pt
Normal file
BIN
06_NN/models/best_model_sophisticated_cnn_fashion_mnist.pt
Normal file
BIN
06_NN/models/nn_6_simple_regressor.onnx
Normal file
BIN
06_NN/models/nn_6_simple_regressor_scripted.pt
Normal file
BIN
06_NN/models/nn_6_simple_regressor_state_dict.pth
Normal file
BIN
06_NN/resources/Biological_Neuron_vs_Artificial_Neuron.png
Normal file
|
After Width: | Height: | Size: 140 KiB |
BIN
06_NN/resources/CNN_Input_1 (1).jpg
Normal file
|
After Width: | Height: | Size: 73 KiB |
BIN
06_NN/resources/CNN_Input_1.jpg
Normal file
|
After Width: | Height: | Size: 73 KiB |
BIN
06_NN/resources/CNN_Input_2 (1).jpg
Normal file
|
After Width: | Height: | Size: 76 KiB |
BIN
06_NN/resources/CNN_Input_2.jpg
Normal file
|
After Width: | Height: | Size: 76 KiB |
BIN
06_NN/resources/Chemnitz_Hauptplatz (1).jpg
Normal file
|
After Width: | Height: | Size: 829 KiB |
BIN
06_NN/resources/Chemnitz_Hauptplatz.jpg
Normal file
|
After Width: | Height: | Size: 829 KiB |
BIN
06_NN/resources/Cifar10 (1).jpg
Normal file
|
After Width: | Height: | Size: 105 KiB |
BIN
06_NN/resources/Cifar10.jpg
Normal file
|
After Width: | Height: | Size: 105 KiB |
BIN
06_NN/resources/Columbus (1).jpeg
Normal file
|
After Width: | Height: | Size: 163 KiB |
BIN
06_NN/resources/Columbus.jpeg
Normal file
|
After Width: | Height: | Size: 163 KiB |
BIN
06_NN/resources/Columbus_Tensor.png
Normal file
|
After Width: | Height: | Size: 496 KiB |
BIN
06_NN/resources/Convolution_Concept1 (1).jpg
Normal file
|
After Width: | Height: | Size: 53 KiB |
BIN
06_NN/resources/Convolution_Concept1.jpg
Normal file
|
After Width: | Height: | Size: 53 KiB |
BIN
06_NN/resources/Convolution_Concept2 (1).jpg
Normal file
|
After Width: | Height: | Size: 55 KiB |
BIN
06_NN/resources/Convolution_Concept2.jpg
Normal file
|
After Width: | Height: | Size: 55 KiB |
BIN
06_NN/resources/Convolution_Concept3 (1).jpg
Normal file
|
After Width: | Height: | Size: 56 KiB |
BIN
06_NN/resources/Convolution_Concept3.jpg
Normal file
|
After Width: | Height: | Size: 56 KiB |
BIN
06_NN/resources/Convolution_Concept4 (1).jpg
Normal file
|
After Width: | Height: | Size: 59 KiB |
BIN
06_NN/resources/Convolution_Concept4.jpg
Normal file
|
After Width: | Height: | Size: 59 KiB |
BIN
06_NN/resources/Convolution_Concept5 (1).jpg
Normal file
|
After Width: | Height: | Size: 60 KiB |
BIN
06_NN/resources/Convolution_Concept5.jpg
Normal file
|
After Width: | Height: | Size: 60 KiB |
BIN
06_NN/resources/Convolution_Concept6 (1).jpg
Normal file
|
After Width: | Height: | Size: 59 KiB |
BIN
06_NN/resources/Convolution_Concept6.jpg
Normal file
|
After Width: | Height: | Size: 59 KiB |
BIN
06_NN/resources/Convolution_Concept7 (1).jpg
Normal file
|
After Width: | Height: | Size: 62 KiB |
BIN
06_NN/resources/Convolution_Concept7.jpg
Normal file
|
After Width: | Height: | Size: 62 KiB |
BIN
06_NN/resources/Convolution_Stride3_1 (1).jpg
Normal file
|
After Width: | Height: | Size: 54 KiB |
BIN
06_NN/resources/Convolution_Stride3_1.jpg
Normal file
|
After Width: | Height: | Size: 54 KiB |
BIN
06_NN/resources/Convolution_Stride3_2 (1).jpg
Normal file
|
After Width: | Height: | Size: 56 KiB |
BIN
06_NN/resources/Convolution_Stride3_2.jpg
Normal file
|
After Width: | Height: | Size: 56 KiB |
BIN
06_NN/resources/Convolution_Stride3_3 (1).jpg
Normal file
|
After Width: | Height: | Size: 60 KiB |
BIN
06_NN/resources/Convolution_Stride3_3.jpg
Normal file
|
After Width: | Height: | Size: 60 KiB |
BIN
06_NN/resources/Convolution_Stride3_4 (1).jpg
Normal file
|
After Width: | Height: | Size: 60 KiB |
BIN
06_NN/resources/Convolution_Stride3_4.jpg
Normal file
|
After Width: | Height: | Size: 60 KiB |
BIN
06_NN/resources/Convolution_Stride3_5 (1).jpg
Normal file
|
After Width: | Height: | Size: 52 KiB |
BIN
06_NN/resources/Convolution_Stride3_5.jpg
Normal file
|
After Width: | Height: | Size: 52 KiB |
BIN
06_NN/resources/Convolution_Zero_Padding_1 (1).jpg
Normal file
|
After Width: | Height: | Size: 42 KiB |
BIN
06_NN/resources/Convolution_Zero_Padding_1.jpg
Normal file
|
After Width: | Height: | Size: 42 KiB |
BIN
06_NN/resources/Convolution_Zero_Padding_2 (1).jpg
Normal file
|
After Width: | Height: | Size: 69 KiB |
BIN
06_NN/resources/Convolution_Zero_Padding_2.jpg
Normal file
|
After Width: | Height: | Size: 69 KiB |
BIN
06_NN/resources/Dropout_Visualized (1).png
Normal file
|
After Width: | Height: | Size: 67 KiB |
BIN
06_NN/resources/Dropout_Visualized.png
Normal file
|
After Width: | Height: | Size: 67 KiB |
BIN
06_NN/resources/Gradient_Descent_Intiution_Derivative.png
Normal file
|
After Width: | Height: | Size: 56 KiB |
BIN
06_NN/resources/Gradient_Descent_Local_Minima.png
Normal file
|
After Width: | Height: | Size: 15 KiB |
BIN
06_NN/resources/Gradient_Descent_Selfmade.png
Normal file
|
After Width: | Height: | Size: 2.2 MiB |
BIN
06_NN/resources/Gradient_Descent_Too_Big_LR.png
Normal file
|
After Width: | Height: | Size: 53 KiB |
BIN
06_NN/resources/Gradient_Descent_Too_Small_LR.png
Normal file
|
After Width: | Height: | Size: 52 KiB |
BIN
06_NN/resources/Instagram_Reel_First_Try_Suspicious.mp4
Normal file
BIN
06_NN/resources/Instagram_Reel_NaNs.mp4
Normal file
BIN
06_NN/resources/Kirby_Convolution (1).jpg
Normal file
|
After Width: | Height: | Size: 566 KiB |
BIN
06_NN/resources/Kirby_Convolution.jpg
Normal file
|
After Width: | Height: | Size: 566 KiB |
BIN
06_NN/resources/Loss_Landscape.png
Normal file
|
After Width: | Height: | Size: 1.6 MiB |
BIN
06_NN/resources/Loss_Landscape_Path.jpeg
Normal file
|
After Width: | Height: | Size: 183 KiB |
BIN
06_NN/resources/MNIST.png
Normal file
|
After Width: | Height: | Size: 69 KiB |
BIN
06_NN/resources/Mario_Convolution (1).png
Normal file
|
After Width: | Height: | Size: 230 KiB |
BIN
06_NN/resources/Mario_Convolution.png
Normal file
|
After Width: | Height: | Size: 230 KiB |
BIN
06_NN/resources/Max_Pooling_1 (1).jpg
Normal file
|
After Width: | Height: | Size: 42 KiB |
BIN
06_NN/resources/Max_Pooling_1.jpg
Normal file
|
After Width: | Height: | Size: 42 KiB |
BIN
06_NN/resources/Max_Pooling_2 (1).jpg
Normal file
|
After Width: | Height: | Size: 45 KiB |
BIN
06_NN/resources/Max_Pooling_2.jpg
Normal file
|
After Width: | Height: | Size: 45 KiB |
BIN
06_NN/resources/Max_Pooling_3 (1).jpg
Normal file
|
After Width: | Height: | Size: 48 KiB |
BIN
06_NN/resources/Max_Pooling_3.jpg
Normal file
|
After Width: | Height: | Size: 48 KiB |
BIN
06_NN/resources/Max_Pooling_4 (1).jpg
Normal file
|
After Width: | Height: | Size: 49 KiB |
BIN
06_NN/resources/Max_Pooling_4.jpg
Normal file
|
After Width: | Height: | Size: 49 KiB |
BIN
06_NN/resources/Max_Pooling_5 (1).jpg
Normal file
|
After Width: | Height: | Size: 50 KiB |
BIN
06_NN/resources/Max_Pooling_5.jpg
Normal file
|
After Width: | Height: | Size: 50 KiB |
BIN
06_NN/resources/NN_Function_Approximation.png
Normal file
|
After Width: | Height: | Size: 33 KiB |
BIN
06_NN/resources/Overfitting_Underfitting_Loss_Curve (1).png
Normal file
|
After Width: | Height: | Size: 36 KiB |
BIN
06_NN/resources/Overfitting_Underfitting_Loss_Curve.png
Normal file
|
After Width: | Height: | Size: 36 KiB |
BIN
06_NN/resources/Perzeptron.png
Normal file
|
After Width: | Height: | Size: 18 KiB |
BIN
06_NN/resources/Perzeptron_zugeschnitten.png
Normal file
|
After Width: | Height: | Size: 13 KiB |
BIN
06_NN/resources/SGD_Local_Minima_Medal.jpg
Normal file
|
After Width: | Height: | Size: 76 KiB |
BIN
06_NN/resources/SGD_vs_GD.png
Normal file
|
After Width: | Height: | Size: 111 KiB |
BIN
06_NN/resources/Sigmoid_plus_Derivative.png
Normal file
|
After Width: | Height: | Size: 29 KiB |
BIN
06_NN/resources/Skip_Connections (1).png
Normal file
|
After Width: | Height: | Size: 8.7 KiB |
BIN
06_NN/resources/Skip_Connections.png
Normal file
|
After Width: | Height: | Size: 8.7 KiB |
BIN
06_NN/resources/Tensor_1.png
Normal file
|
After Width: | Height: | Size: 10 KiB |
BIN
06_NN/resources/Tensor_2.png
Normal file
|
After Width: | Height: | Size: 210 KiB |
BIN
06_NN/resources/Tensors_Everywhere.jpg
Normal file
|
After Width: | Height: | Size: 41 KiB |
BIN
06_NN/resources/Vectored.png
Normal file
|
After Width: | Height: | Size: 282 KiB |