diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1287439 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.pyc +*/*.pyc +*/*/*/.pyc diff --git a/demo.py b/demo.py index bbf8179..3551baa 100644 --- a/demo.py +++ b/demo.py @@ -1,3 +1,5 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- import sys sys.path.append('./') @@ -7,18 +9,32 @@ import cv2 import numpy as np -classes_name = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train","tvmonitor"] +classes_name = ["aeroplane", "bicycle", "bird", "boat", "bottle", + "bus", "car", "cat", "chair", "cow", + "diningtable", "dog", "horse", "motorbike", "person", + "pottedplant", "sheep", "sofa", "train","tvmonitor"] +common_params = { 'image_size': 448, + 'num_classes': 20, + 'batch_size':1} + +net_params = {'cell_size': 7, + 'boxes_per_cell':2, + 'weight_decay': 0.0005} def process_predicts(predicts): + """ + 对于规范化的输出结果对于特定的用户可能觉得不习惯,那么实现一个接口,将规范化 + 的结果重新编写为用户习惯的数据类型 + """ p_classes = predicts[0, :, :, 0:20] C = predicts[0, :, :, 20:22] coordinate = predicts[0, :, :, 22:] - + # 训练的模型设置超参数 net_params, 其中cell大小设置为7 p_classes = np.reshape(p_classes, (7, 7, 1, 20)) C = np.reshape(C, (7, 7, 2, 1)) - P = C * p_classes + P = C * p_classes # P size = (7, 7, 2, 20) #print P[5,1, 0, :] @@ -51,36 +67,42 @@ def process_predicts(predicts): return xmin, ymin, xmax, ymax, class_num -common_params = {'image_size': 448, 'num_classes': 20, - 'batch_size':1} -net_params = {'cell_size': 7, 'boxes_per_cell':2, 'weight_decay': 0.0005} - -net = YoloTinyNet(common_params, net_params, test=True) - -image = tf.placeholder(tf.float32, (1, 448, 448, 3)) -predicts = net.inference(image) +def main(): -sess = tf.Session() -np_img = cv2.imread('cat.jpg') -resized_img = cv2.resize(np_img, (448, 448)) -np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB) + net = YoloTinyNet(common_params, net_params, test=True) + # tensorflow中声明占位符号image, 这在后面run的时候 + # feed_dict中会出现该占位符和对应的值,意思就是输入数据的来源 + image = tf.placeholder(tf.float32, (1, 448, 448, 3)) + predicts = net.inference(image) + sess = tf.Session() -np_img = np_img.astype(np.float32) + # 转化数据格式 + np_img = cv2.imread('cat.jpg') + resized_img = cv2.resize(np_img, (448, 448)) + np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB) -np_img = np_img / 255.0 * 2 - 1 -np_img = np.reshape(np_img, (1, 448, 448, 3)) + np_img = np_img.astype(np.float32) + #白化输入的数据 + np_img = np_img / 255.0 * 2 - 1 + np_img = np.reshape(np_img, (1, 448, 448, 3)) -saver = tf.train.Saver(net.trainable_collection) + saver = tf.train.Saver(net.trainable_collection) -saver.restore(sess, 'models/pretrain/yolo_tiny.ckpt') + saver.restore(sess, 'models/pretrain/yolo_tiny.ckpt') + # The optional feed_dict argument allows the caller to override + # the value of tensors in the graph. + np_predict = sess.run(predicts, feed_dict={image: np_img}) -np_predict = sess.run(predicts, feed_dict={image: np_img}) + xmin, ymin, xmax, ymax, class_num = process_predicts(np_predict) + class_name = classes_name[class_num] + # 绘制预测框, 输出预测类型 + cv2.rectangle(resized_img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 0, 255)) + cv2.putText(resized_img, + class_name, (int(xmin), int(ymin)), 2, 1.5, (0, 0, 255)) + cv2.imwrite('cat_out.jpg', resized_img) + sess.close() -xmin, ymin, xmax, ymax, class_num = process_predicts(np_predict) -class_name = classes_name[class_num] -cv2.rectangle(resized_img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 0, 255)) -cv2.putText(resized_img, class_name, (int(xmin), int(ymin)), 2, 1.5, (0, 0, 255)) -cv2.imwrite('cat_out.jpg', resized_img) -sess.close() +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tools/preprocess_pascal_voc.py b/tools/preprocess_pascal_voc.py index 76d43fc..fee497f 100755 --- a/tools/preprocess_pascal_voc.py +++ b/tools/preprocess_pascal_voc.py @@ -1,3 +1,5 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- """preprocess pascal_voc data """ import os diff --git a/tools/train.py b/tools/train.py index 5b399ba..572c67b 100644 --- a/tools/train.py +++ b/tools/train.py @@ -1,3 +1,5 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- import sys from optparse import OptionParser diff --git a/yolo/dataset/dataset.py b/yolo/dataset/dataset.py index de56381..7bd8dd9 100644 --- a/yolo/dataset/dataset.py +++ b/yolo/dataset/dataset.py @@ -1,3 +1,6 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + """DataSet base class """ class DataSet(object): diff --git a/yolo/dataset/text_dataset.py b/yolo/dataset/text_dataset.py index 15c2b77..d4e886b 100644 --- a/yolo/dataset/text_dataset.py +++ b/yolo/dataset/text_dataset.py @@ -1,3 +1,5 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -12,11 +14,13 @@ from yolo.dataset.dataset import DataSet + class TextDataSet(DataSet): """TextDataSet process text input file dataset text file format: image_path xmin1 ymin1 xmax1 ymax1 class1 xmin2 ymin2 xmax2 ymax2 class2 + 设计思想是采用生产者消费者模式, """ def __init__(self, common_params, dataset_params): @@ -52,11 +56,12 @@ def __init__(self, common_params, dataset_params): self.record_number = len(self.record_list) self.num_batch_per_epoch = int(self.record_number / self.batch_size) - + # 创建生产者守护进程并启动 t_record_producer = Thread(target=self.record_producer) t_record_producer.daemon = True t_record_producer.start() - + + # 创建thread_num个消费者守护进程并启动 for i in range(self.thread_num): t = Thread(target=self.record_customer) t.daemon = True @@ -109,6 +114,12 @@ def record_process(self, record): labels[object_num] = [xcenter, ycenter, box_w, box_h, class_num] object_num += 1 i += 5 + # TODO: + # 这个地方会不会忽略掉一些显著特征呢? + # 因为self.max_objects是自定义的变量,在读取的过程中, + # 仅仅读取前面的数据的话,后面的会被忽略掉的。 + # TODO: 训练数据中每张图片的物体是如何给出的,是根据显著性呢还是根据 + # 起始点为位置大小给出的呢,这个需要check一下 if object_num >= self.max_objects: break return [image, labels, object_num] @@ -124,9 +135,9 @@ def record_customer(self): def batch(self): """get batch Returns: - images: 4-D ndarray [batch_size, height, width, 3] - labels: 3-D ndarray [batch_size, max_objects, 5] - objects_num: 1-D ndarray [batch_size] + images: 4-D ndarray [batch_size, height, width, 3] 一个batch中所有图片数据 + labels: 3-D ndarray [batch_size, max_objects, 5] 一个batch中的所有图片的中的所有物体的标签 + objects_num: 1-D ndarray [batch_size] 一个batch中每个图片中object的个数 """ images = [] labels = [] diff --git a/yolo/net/net.py b/yolo/net/net.py index 005d0de..d731004 100644 --- a/yolo/net/net.py +++ b/yolo/net/net.py @@ -1,3 +1,5 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -51,7 +53,6 @@ def _variable_with_weight_decay(self, name, shape, stddev, wd, pretrain=True, tr stddev: standard devision of a truncated Gaussian wd: add L2Loss weight decay multiplied by this float. If None, weight decay is not added for this Variable. - Returns: Variable Tensor """ @@ -74,16 +75,19 @@ def conv2d(self, scope, input, kernel_size, stride=1, pretrain=True, train=True) output: 4-D tensor [batch_size, height/stride, width/stride, out_channels] """ with tf.variable_scope(scope) as scope: + # 初始化权重的kernel kernel = self._variable_with_weight_decay('weights', shape=kernel_size, stddev=5e-2, wd=self.weight_decay, pretrain=pretrain, train=train) conv = tf.nn.conv2d(input, kernel, [1, stride, stride, 1], padding='SAME') + + # biases 初始化采用常数 0.0 初始化 biases = self._variable_on_cpu('biases', kernel_size[3:], tf.constant_initializer(0.0), pretrain, train) - bias = tf.nn.bias_add(conv, biases) - conv1 = self.leaky_relu(bias) + conv1 = tf.nn.bias_add(conv, biases) + output = self.leaky_relu(conv1) - return conv1 + return output def max_pool(self, input, kernel_size, stride): @@ -99,7 +103,7 @@ def max_pool(self, input, kernel_size, stride): return tf.nn.max_pool(input, ksize=[1, kernel_size[0], kernel_size[1], 1], strides=[1, stride, stride, 1], padding='SAME') - def local(self, scope, input, in_dimension, out_dimension, leaky=True, pretrain=True, train=True): + def local(self, scope, _input, in_dimension, out_dimension, leaky=True, pretrain=True, train=True): """Fully connection layer Args: @@ -110,7 +114,7 @@ def local(self, scope, input, in_dimension, out_dimension, leaky=True, pretrain= output: 2-D tensor [batch_size, out_dimension] """ with tf.variable_scope(scope) as scope: - reshape = tf.reshape(input, [tf.shape(input)[0], -1]) + reshape = tf.reshape(_input, [tf.shape(_input)[0], -1]) weights = self._variable_with_weight_decay('weights', shape=[in_dimension, out_dimension], stddev=0.04, wd=self.weight_decay, pretrain=pretrain, train=train) @@ -137,6 +141,9 @@ def leaky_relu(self, x, alpha=0.1, dtype=tf.float32): y : Tensor """ x = tf.cast(x, dtype=dtype) + # 对输入的特征向量进行leaky_relu + # 其中对>0的数据采用直接激活的方式,对小于0的数据采用leaky激活方式 + # 此处实现值得学习和借鉴 bool_mask = (x > 0) mask = tf.cast(bool_mask, dtype=dtype) return 1.0 * mask * x + alpha * (1 - mask) * x @@ -160,4 +167,13 @@ def loss(self, predicts, labels, objects_num): labels : 3-D tensor of [batch_size, max_objects, 5] objects_num: 1-D tensor [batch_size] """ - raise NotImplementedError \ No newline at end of file + raise NotImplementedError + +''' +## weight decay: +在机器学习或者模式识别中,会出现overfitting,而当网络逐渐overfitting时网络 +权值逐渐变大,因此,为了避免出现overfitting,会给误差函数添加一个惩罚项,常用 +的惩罚项是所有权重的平方乘以一个衰减常量之和。其用来惩罚大的权值。 +权值衰减惩罚项使得权值收敛到较小的绝对值,而惩罚大的权值。因为大的权值会使得 +系统出现过拟合,降低其泛化性能。 +''' \ No newline at end of file diff --git a/yolo/net/yolo_net.py b/yolo/net/yolo_net.py index 0dfa034..71abb2c 100644 --- a/yolo/net/yolo_net.py +++ b/yolo/net/yolo_net.py @@ -1,3 +1,5 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -123,6 +125,7 @@ def iou(self, boxes1, boxes2): Return: iou: 3-D tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL] """ + # 计算左上角和右下角的位置信息 boxes1 = tf.pack([boxes1[:, :, :, 0] - boxes1[:, :, :, 2] / 2, boxes1[:, :, :, 1] - boxes1[:, :, :, 3] / 2, boxes1[:, :, :, 0] + boxes1[:, :, :, 2] / 2, boxes1[:, :, :, 1] + boxes1[:, :, :, 3] / 2]) boxes1 = tf.transpose(boxes1, [1, 2, 3, 0]) @@ -134,6 +137,16 @@ def iou(self, boxes1, boxes2): rd = tf.minimum(boxes1[:, :, :, 2:], boxes2[2:]) #intersection + ''' + 0, 0------------------------> + | ————————————| + | | ——————|—————— + | | | | | + | |—————|—————| | + | |____________| + | + v + ''' intersection = rd - lu inter_square = intersection[:, :, :, 0] * intersection[:, :, :, 1] @@ -285,7 +298,17 @@ def loss(self, predicts, labels, objects_num): label = labels[i, :, :] object_num = objects_num[i] nilboy = tf.ones([7,7,2]) - tuple_results = tf.while_loop(self.cond1, self.body1, [tf.constant(0), object_num, [class_loss, object_loss, noobject_loss, coord_loss], predict, label, nilboy]) + tuple_results = tf.while_loop( + self.cond1, + self.body1, + [ + tf.constant(0), + object_num, + [class_loss, object_loss, noobject_loss, coord_loss], + predict, + label, + nilboy + ]) for j in range(4): loss[j] = loss[j] + tuple_results[2][j] nilboy = tuple_results[5] diff --git a/yolo/net/yolo_tiny_net.py b/yolo/net/yolo_tiny_net.py index 6f3c48e..696790a 100644 --- a/yolo/net/yolo_tiny_net.py +++ b/yolo/net/yolo_tiny_net.py @@ -1,3 +1,5 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -39,7 +41,16 @@ def inference(self, images): predicts: 4-D tensor [batch_size, cell_size, cell_size, num_classes + 5 * boxes_per_cell] """ conv_num = 1 - + """ + conv2d(self, scope, input, kernel_size, stride=1, pretrain=True, train=True) + Args: + input: 4-D tensor [batch_size, height, width, depth] + scope: variable_scope name + kernel_size: [k_height, k_width, in_channel, out_channel] + stride: int32 + Return: + output: 4-D tensor [batch_size, height/stride, width/stride, out_channels] + """ temp_conv = self.conv2d('conv' + str(conv_num), images, [3, 3, 3, 16], stride=1) conv_num += 1 @@ -96,7 +107,7 @@ def inference(self, images): scales = tf.reshape(local3[:, n1:n2], (-1, self.cell_size, self.cell_size, self.boxes_per_cell)) boxes = tf.reshape(local3[:, n2:], (-1, self.cell_size, self.cell_size, self.boxes_per_cell * 4)) - local3 = tf.concat([class_probs, scales, boxes], 3) + local3 = tf.concat(3, [class_probs, scales, boxes]) predicts = local3 @@ -117,7 +128,10 @@ def iou(self, boxes1, boxes2): boxes2[0] + boxes2[2] / 2, boxes2[1] + boxes2[3] / 2]) #calculate the left up point + # boxes相当于grandtruth的box,只有一个 + # 但是boxes1相当于ROI,是有很多的 lu = tf.maximum(boxes1[:, :, :, 0:2], boxes2[0:2]) + # calculate the right down point rd = tf.minimum(boxes1[:, :, :, 2:], boxes2[2:]) #intersection @@ -135,13 +149,14 @@ def iou(self, boxes1, boxes2): return inter_square/(square1 + square2 - inter_square + 1e-6) + # loop停止函数和后面的运行函数体的输入参数是一致的,输入相同的参数数据 def cond1(self, num, object_num, loss, predict, label, nilboy): """ if num < object_num """ return num < object_num - + # 运行的函数体的定义 def body1(self, num, object_num, loss, predict, labels, nilboy): """ calculate loss @@ -202,10 +217,10 @@ def body1(self, num, object_num, loss, predict, labels, nilboy): p_C = predict[:, :, self.num_classes:self.num_classes + self.boxes_per_cell] - #calculate truth x,y,sqrt_w,sqrt_h 0-D + #calculate truth x, y, sqrt_w, sqrt_h 0-D x = label[0] y = label[1] - + # TODO:为啥要使用sqrt的宽度和高度来 sqrt_w = tf.sqrt(tf.abs(label[2])) sqrt_h = tf.sqrt(tf.abs(label[3])) #sqrt_w = tf.abs(label[2]) @@ -223,6 +238,7 @@ def body1(self, num, object_num, loss, predict, labels, nilboy): #p_sqrt_h = predict_boxes[:, :, :, 3] p_sqrt_w = tf.sqrt(tf.minimum(self.image_size * 1.0, tf.maximum(0.0, predict_boxes[:, :, :, 2]))) p_sqrt_h = tf.sqrt(tf.minimum(self.image_size * 1.0, tf.maximum(0.0, predict_boxes[:, :, :, 3]))) + #calculate truth p 1-D tensor [NUM_CLASSES] P = tf.one_hot(tf.cast(label[4], tf.int32), self.num_classes, dtype=tf.float32) @@ -262,17 +278,29 @@ def loss(self, predicts, labels, objects_num): labels : 3-D tensor of [batch_size, max_objects, 5] objects_num: 1-D tensor [batch_size] """ - class_loss = tf.constant(0, tf.float32) - object_loss = tf.constant(0, tf.float32) - noobject_loss = tf.constant(0, tf.float32) - coord_loss = tf.constant(0, tf.float32) + # 定义不同的loss变量 + class_loss = tf.constant(0, tf.float32) # 分类损失 + object_loss = tf.constant(0, tf.float32) # 有对象的时候与ground truth的损失 + noobject_loss = tf.constant(0, tf.float32) # 预测框中没有对象的时候的损失 + coord_loss = tf.constant(0, tf.float32) # 预测框位置信息损失 + loss = [0, 0, 0, 0] for i in range(self.batch_size): predict = predicts[i, :, :, :] label = labels[i, :, :] object_num = objects_num[i] nilboy = tf.ones([7,7,2]) - tuple_results = tf.while_loop(self.cond1, self.body1, [tf.constant(0), object_num, [class_loss, object_loss, noobject_loss, coord_loss], predict, label, nilboy]) + # 返回值就是被调用函数题的输入数据 + tuple_results = tf.while_loop(self.cond1,# 其输入参数就是后面list中的变量 + self.body1,# 输入参数就是其后的list变量,详细见其定义 + [ tf.constant(0), + object_num, + [class_loss, object_loss, noobject_loss, coord_loss], + predict, + label, + nilboy + ]) + #累加各类loss值 for j in range(4): loss[j] = loss[j] + tuple_results[2][j] nilboy = tuple_results[5] diff --git a/yolo/solver/solver.py b/yolo/solver/solver.py index 50bb8d4..8fd65b3 100644 --- a/yolo/solver/solver.py +++ b/yolo/solver/solver.py @@ -1,3 +1,6 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + """Solver Abstract class """ class Solver(object): diff --git a/yolo/solver/yolo_solver.py b/yolo/solver/yolo_solver.py index 0d9b4f3..9a747e7 100644 --- a/yolo/solver/yolo_solver.py +++ b/yolo/solver/yolo_solver.py @@ -1,3 +1,5 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -54,6 +56,7 @@ def _train(self): def construct_graph(self): # construct graph self.global_step = tf.Variable(0, trainable=False) + # 搭建神经网络模型的输入和输出结构 self.images = tf.placeholder(tf.float32, (self.batch_size, self.height, self.width, 3)) self.labels = tf.placeholder(tf.float32, (self.batch_size, self.max_objects, 5)) self.objects_num = tf.placeholder(tf.int32, (self.batch_size)) @@ -71,6 +74,7 @@ def solve(self): init = tf.global_variables_initializer() + # Merges all summaries collected in the default graph. summary_op = tf.summary.merge_all() sess = tf.Session() @@ -83,16 +87,21 @@ def solve(self): for step in xrange(self.max_iterators): start_time = time.time() + # 获取train data np_images, np_labels, np_objects_num = self.dataset.batch() - - _, loss_value, nilboy = sess.run([self.train_op, self.total_loss, self.nilboy], feed_dict={self.images: np_images, self.labels: np_labels, self.objects_num: np_objects_num}) + # 训练模型一个batch + _, loss_value, nilboy = sess.run([self.train_op, self.total_loss, self.nilboy], + feed_dict= { + self.images: np_images, + self.labels: np_labels, + self.objects_num: np_objects_num + }) #loss_value, nilboy = sess.run([self.total_loss, self.nilboy], feed_dict={self.images: np_images, self.labels: np_labels, self.objects_num: np_objects_num}) - duration = time.time() - start_time - assert not np.isnan(loss_value), 'Model diverged with loss = NaN' + #10次迭代输入计算信息 if step % 10 == 0: num_examples_per_step = self.dataset.batch_size examples_per_sec = num_examples_per_step / duration @@ -104,9 +113,15 @@ def solve(self): examples_per_sec, sec_per_batch)) sys.stdout.flush() + # 100次迭代更新后将预测结果写入文件 if step % 100 == 0: - summary_str = sess.run(summary_op, feed_dict={self.images: np_images, self.labels: np_labels, self.objects_num: np_objects_num}) + summary_str = sess.run(summary_op, feed_dict={ + self.images: np_images, + self.labels: np_labels, + self.objects_num: np_objects_num + }) summary_writer.add_summary(summary_str, step) + # 5000次迭代保存一个模型 if step % 5000 == 0: saver2.save(sess, self.train_dir + '/model.ckpt', global_step=step) sess.close() diff --git a/yolo/utils/process_config.py b/yolo/utils/process_config.py index 93a783f..a8fbbb0 100644 --- a/yolo/utils/process_config.py +++ b/yolo/utils/process_config.py @@ -1,3 +1,9 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +""" +从配置文件中读取相应的配置参数设置 +这个方法是可以重用的 +""" import ConfigParser def process_config(conf_file):