Используя как основу статью Валида Абдуллы ( https://medium.com/@waleedka/traffic-sign-recognition-with-tensorflow-629dffc391a6 ),
попробуем сделать рабочую программу распознавания дорожных знаков. Сначала бельгийских, затем нашенских. Ссылка на скачивание российского датасета есть в предыдущих постах… Бельгийский датасет есть в примерах.
С учетом того, что программа писалась для версии 0.12 TensorFlow, ее надо немного модифицировать и поставить необходимые библиотеки.
Программа для Python 2.7 и TensorFlow 1.1 (в тексте есть все ремарки для понимания):
from __future__ import division from tensorflow.python.framework import graph_util from tensorflow.python.platform import gfile import os import random import skimage.data import skimage.transform import matplotlib import matplotlib.pyplot as plt import numpy as np import tensorflow as tf def load_data(data_dir): """Loads a data set and returns two lists: images: a list of Numpy arrays, each representing an image. labels: a list of numbers that represent the images labels. """ # Get all subdirectories of data_dir. Each represents a label. directories = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))] # Loop through the label directories and collect the data in # two lists, labels and images. labels = [] images = [] for d in directories: label_dir = os.path.join(data_dir, d) file_names = [os.path.join(label_dir, f) for f in os.listdir(label_dir) if f.endswith(".jpg")] # For each label, load it's images and add them to the images list. # And add the label number (i.e. directory name) to the labels list. for f in file_names: images.append(skimage.data.imread(f)) labels.append(int(d)) return images, labels def display_images_and_labels(images, labels): """Display the first image of each label.""" unique_labels = set(labels) plt.figure(figsize=(15, 15)) i = 1 for label in unique_labels: if i<=64: #restriction from matplotlib # Pick the first image for each label. 
image = images[labels.index(label)] plt.subplot(8, 8, i) # A grid of 8 rows x 8 columns plt.axis('off') plt.title("Label {0} ({1})".format(label, labels.count(label))) _ = plt.imshow(image) i += 1 plt.show() def display_label_images(images, label): """Display images of a specific label.""" limit = 24 # show a max of 24 images plt.figure(figsize=(15, 5)) i = 1 start = labels.index(label) end = start + labels.count(label) for image in images[start:end][:limit]: plt.subplot(3, 8, i) # 3 rows, 8 per row plt.axis('off') i += 1 plt.imshow(image) plt.show() # Load training and testing datasets. ROOT_PATH = "/home/tensorflow/python_prog/traffic_ru" train_data_dir = os.path.join(ROOT_PATH, "Training") test_data_dir = os.path.join(ROOT_PATH, "Testing") output_graph = os.path.join(ROOT_PATH, "traffic_ru.chkp") output_labels = os.path.join(ROOT_PATH, "traffic_ru_lbl.txt") images, labels = load_data(train_data_dir) unique_labels_set = set(labels) unique_labels_set_string = [] for label in unique_labels_set: unique_labels_set_string.append(str(label)) print("Unique Labels: {0}\nTotal Images: {1}".format(len(set(labels)), len(images))) #display_images_and_labels(images, labels) #display_label_images(images, 32) for image in images[:5]: print("shape: {0}, min: {1}, max: {2}".format(image.shape, image.min(), image.max())) # Resize images images32 = [skimage.transform.resize(image, (32, 32)) for image in images] #display_images_and_labels(images32, labels) for image in images32[:5]: print("shape: {0}, min: {1}, max: {2}".format(image.shape, image.min(), image.max())) labels_a = np.array(labels) images_a = np.array(images32) print("labels: ", labels_a.shape, "\nimages: ", images_a.shape) # Create a graph to hold the model. graph = tf.Graph() # Create model in the graph. with graph.as_default(): # Placeholders for inputs and labels. 
#images_ph = tf.placeholder(tf.float32, [None, 32, 32, 3], name = "image_jpeg") images_ph = tf.placeholder(tf.float32, [None, 32, 32, 3]) labels_ph = tf.placeholder(tf.int32, [None]) # Flatten input from: [None, height, width, channels] # To: [None, height * width * channels] == [None, 3072] images_flat = tf.contrib.layers.flatten(images_ph) # Fully connected layer. # Generates logits of size [None, 68] logits = tf.contrib.layers.fully_connected(images_flat, 68, tf.nn.relu) # Convert logits to label indexes (int). # Shape [None], which is a 1D vector of length == batch_size. #predicted_labels = tf.argmax(logits, 1, name = "predicted") predicted_labels = tf.argmax(logits, 1) # Define the loss function. # Cross-entropy is a good choice for classification. # !!!!!!!!!!!!!!!!!!!!!!!!!!!!! changed !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels = labels_ph), name = "loss_func") loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels = labels_ph)) # Create training op. #train = tf.train.AdamOptimizer(learning_rate=0.001, epsilon=1.0).minimize(loss) #train = tf.train.AdamOptimizer(learning_rate=0.001, epsilon=0.00000001, name = "train_func").minimize(loss) train = tf.train.AdamOptimizer(learning_rate=0.001, epsilon=0.00000001).minimize(loss) # Let's create a Saver object # By default, the Saver handles every Variables related to the default graph all_saver = tf.train.Saver() # And, finally, an initialization op to execute before training. init = tf.global_variables_initializer() print("images_flat: ", images_flat) print("logits: ", logits) print("loss: ", loss) print("predicted_labels: ", predicted_labels) # Create a session to run the graph we created. session = tf.Session(graph=graph) # First step is always to initialize all variables. # We don't care about the return value, though. It's None. 
_ = session.run([init]) # We can list operations #for op in session.graph.get_operations(): #print(op.name) steps = 401 for i in range(steps): _, loss_value = session.run([train, loss], feed_dict={images_ph: images_a, labels_ph: labels_a}) if i % 10 == 0: print("Loss: ", loss_value) #Save result to graph all_saver.save(session, output_graph, global_step = steps) # Pick 10 random images sample_indexes = random.sample(range(len(images32)), 10) sample_images = [images32[i] for i in sample_indexes] sample_labels = [labels[i] for i in sample_indexes] # Run the "predicted_labels" op. predicted = session.run([predicted_labels], feed_dict={images_ph: sample_images})[0] print(sample_labels) print(predicted) # Display the predictions and the ground truth visually. fig = plt.figure(figsize=(10, 10)) for i in range(len(sample_images)): truth = sample_labels[i] prediction = predicted[i] plt.subplot(5, 2,1+i) plt.axis('off') color='green' if truth == prediction else 'red' plt.text(40, 10, "Truth: {0}\nPrediction: {1}".format(truth, prediction), fontsize=12, color=color) plt.imshow(sample_images[i]) #plt.show() # Load the test dataset. test_images, test_labels = load_data(test_data_dir) # Transform the images, just like we did with the training set. test_images32 = [skimage.transform.resize(image, (32, 32)) for image in test_images] #display_images_and_labels(test_images32, test_labels) # Run predictions against the full test set. predicted = session.run([predicted_labels], feed_dict={images_ph: test_images32})[0] #print(test_labels) #print(predicted) # Calculate how many matches we got. match_count = sum([int(y == y_) for y, y_ in zip(test_labels, predicted)]) print(match_count) print(len(test_labels)) accuracy = match_count/len(test_labels) print(accuracy) # Write out the trained graph and labels with the weights stored as # constants. 
#output_graph_def = graph_util.convert_variables_to_constants(session, graph.as_graph_def(), ["predicted"]) #with gfile.FastGFile(output_graph, 'wb') as f: #f.write(output_graph_def.SerializeToString()) with gfile.FastGFile(output_labels, 'w') as f: f.write('\n'.join(unique_labels_set_string) + '\n') # Close the session. This will destroy the trained model. session.close()
На бельгийском датасете из 62 типов знаков и примерно 4600 изображений точность распознавания составила 0,64 в цикле из 401 эпохи, причем увеличение количества эпох незначительно увеличивало процент распознавания.
На российском датасете (67 типов знаков и 39000 картинок) сразу возникала ошибка «nan» при попытке потренировать нейросеть. Опытным путем выяснилось, что в данном месте надо указать количество категорий (папок с типами знаков) на 1 больше необходимого. (Как оказалось позже, для тренировки использовались папки с классами знаков, пронумерованные от 1 до 67. После переименования папок от 0 до 66 (67 категорий) заработал вариант с цифрой 67.) Век живи – век учись.
# Fully connected layer.
# Generates logits of size [None, 68] (was 62 in the Belgian example).
# Labels are fed to the loss as integer class indexes, so the layer width
# must be greater than the largest label id.
logits = tf.contrib.layers.fully_connected(images_flat, 68, tf.nn.relu)
В поисках столь простого решения пришлось поиграться с learning rate и epsilon (так советовали гуру в различных текстах…) в этом месте:
# Create training op.
# Earlier experiment with a large epsilon, kept for reference:
#train = tf.train.AdamOptimizer(learning_rate=0.001, epsilon=1.0).minimize(loss)
# Variant in use: Adam with learning rate 0.001 and epsilon 1e-8.
train = tf.train.AdamOptimizer(learning_rate=0.001, epsilon=0.00000001).minimize(loss)
В документации TensorFlow говорится, что для больших датасетов epsilon может быть в диапазоне 1.0 – 0.1; поставил стандартное значение…
В любом случае – датасет с этими знаками очень нехорош… надо будет поискать более качественные фото или почистить данный датасет – распознавание не лучше 60%…
Создав тестовую директорию с изображениями, которых не было в тренировочной базе, и раскомментировав последний кусок программы, провел обучение на 400 шагах… Тренировка идет около 5 минут (i3 2,7 GHz, 6 Gb, GeForce 720 2 Gb).
53 процента…
Еще один повтор обучения: 8 из 10 на учебном датасете и те же 53 процента на тестовом датасете.
Вывод – надо поработать с датасетами…
update + 3 дня:
Использовал стандартную базу из 25000 картинок (без модификации) – 401 шаг. Распознавание улучшилось до 63 процентов.
Такое впечатление, что иногда тренировка идет не очень, а иногда – отлично…
По дороге сделал сохранение натренированного графа в protobuf-файл (.pb), чтобы распознавание можно было использовать без обязательной предварительной тренировки.
программа для преобразования полученного chkp файла в pb:
"""Convert the latest saved checkpoint into a frozen protobuf (.pb) graph.

Restores the most recent checkpoint found in model_folder, converts the
variables to constants and writes the result next to the checkpoint as
traffic_ru.pb.
"""
import os

import tensorflow as tf
from tensorflow.python.framework import graph_util

model_folder = '/home/tensorflow/python_prog/traffic_ru/'

# We retrieve our checkpoint fullpath.
# get_checkpoint_state() returns None when no checkpoint state is available,
# so fail with a readable message instead of an AttributeError.
checkpoint = tf.train.get_checkpoint_state(model_folder)
if checkpoint is None or not checkpoint.model_checkpoint_path:
    raise IOError("No checkpoint found in %s" % model_folder)
input_checkpoint = checkpoint.model_checkpoint_path

# We set the full file name of our frozen graph: it is written into the
# directory that holds the checkpoint.
absolute_model_folder = os.path.dirname(input_checkpoint)
output_graph = os.path.join(absolute_model_folder, "traffic_ru.pb")

# Before exporting our graph, we need to specify what our output nodes are.
# This is how TF decides what part of the graph it has to keep and what part
# it can dump.
# NOTE: this variable is plural, because you can have multiple output nodes.
# These are default op names, since the training graph leaves its
# placeholders and its argmax op unnamed.
output_node_names = "Placeholder,Placeholder_1,ArgMax"

# We clear devices to allow TensorFlow to control on which device it will
# load operations.
clear_devices = True

# We import the meta graph and retrieve a Saver.
saver = tf.train.import_meta_graph(input_checkpoint + '.meta',
                                   clear_devices=clear_devices)

# We retrieve the protobuf graph definition.
graph = tf.get_default_graph()
input_graph_def = graph.as_graph_def()

# We start a session and restore the graph weights.
with tf.Session() as sess:
    saver.restore(sess, input_checkpoint)

    # We use a built-in TF helper to export variables to constants.
    output_graph_def = graph_util.convert_variables_to_constants(
        sess,  # The session is used to retrieve the weights
        input_graph_def,  # The graph_def is used to retrieve the nodes
        output_node_names.split(",")  # Used to select the useful nodes
    )

    # Finally we serialize and dump the output graph to the filesystem.
    with tf.gfile.GFile(output_graph, "wb") as f:
        f.write(output_graph_def.SerializeToString())
    print("%d ops in the final graph." % len(output_graph_def.node))
Ну и, собственно, программа для тестирования распознавания одиночного файла:
"""Predict the traffic sign class of a single jpg file.

The model is trained with the traffic_ru.py program and converted to a
protobuf file (*.pb) by the traffic_ru_make_pb.py program.

Andrey Surkov 2017
"""
import numpy as np
import tensorflow as tf
import skimage.data
import skimage.transform
from numpy import array

imagePath = '/home/tensorflow/python_prog/test2.jpg'
modelFullPath = '/home/tensorflow/python_prog/traffic_ru/traffic_ru.pb'
labelsFullPath = '/home/tensorflow/python_prog/traffic_ru/traffic_ru_lbl.txt'


def create_graph():
    """Import the saved GraphDef file into the default graph.

    Reads modelFullPath and imports its nodes with an empty name prefix, so
    the ops keep the names they had in the saved graph. Returns nothing.
    """
    with tf.gfile.FastGFile(modelFullPath, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        _ = tf.import_graph_def(graph_def, name='')


def _load_labels(path):
    """Return the lines of the labels file at path, without line endings."""
    # Text mode and a context manager: the handle is always closed, and the
    # lines are str on both Python 2 and Python 3.
    with open(path, 'r') as f:
        return [line.rstrip("\r\n") for line in f]


def run_predict_image():
    """Classify the image at imagePath with the frozen graph.

    Returns:
        The line of the labels file whose index equals the predicted class
        id, or None when imagePath does not exist. When the predicted id has
        no corresponding line, the id itself is returned as a string.
    """
    answer = None

    # Check that the jpg file exists.
    if not tf.gfile.Exists(imagePath):
        tf.logging.fatal('File does not exist %s', imagePath)
        return answer

    # Read the image data.
    image_data = skimage.data.imread(imagePath)

    # Resize the image to the input size of the graph.
    image32 = skimage.transform.resize(image_data, (32, 32))
    #print("shape: {0}, min: {1}, max: {2}".format(image32.shape, image32.min(), image32.max()))

    # The placeholder receives a batch of images, so wrap the single image
    # into a batch of one element.
    # from graph => images_ph = tf.placeholder(tf.float32, [None, 32, 32, 3])
    image_a = array(image32).reshape(1, 32, 32, 3)

    # Creates graph from saved GraphDef.
    create_graph()

    with tf.Session() as sess:
        # We can list operations
        #for op in sess.graph.get_operations():
            #print(op.name)

        # The argmax tensor is the prediction output of the graph.
        # from graph => predicted_labels = tf.argmax(logits, 1)
        # Its default name is ArgMax; check it by listing all operations.
        argmax_tensor = sess.graph.get_tensor_by_name('ArgMax:0')

        # This is the placeholder of the graph that receives the image.
        # from graph => images_ph = tf.placeholder(tf.float32, [None, 32, 32, 3])
        # Its default name is Placeholder.
        image_ph = sess.graph.get_tensor_by_name('Placeholder:0')

        # Make the prediction; only one image is fed, so take element 0.
        predictions = sess.run(argmax_tensor, feed_dict={image_ph: image_a})[0]
        class_id = int(np.squeeze(predictions))
        #print(class_id)

        labels = _load_labels(labelsFullPath)
        if 0 <= class_id < len(labels):
            answer = labels[class_id]
        else:
            # The output layer may be wider than the number of label lines;
            # report the raw class id instead of raising IndexError.
            answer = str(class_id)
        return answer


if __name__ == '__main__':
    print(run_predict_image())
Все программы достаточно полно прокомментированы для их понимания.
как выглядят директории: