Tensorflow Precision / Recall / F1 score y matriz de confusión

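En realidad no necesita sklearn para calcular la precisión, la recuperación (recall) ni la puntuación F1. Puede expresarlas fácilmente al estilo de TensorFlow a partir de sus fórmulas: $\text{precisión} = \frac{TP}{TP + FP}$, $\text{recall} = \frac{TP}{TP + FN}$ y $F_1 = \frac{2 \cdot \text{precisión} \cdot \text{recall}}{\text{precisión} + \text{recall}}$.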

Ahora bien, si tiene sus valores `actual` y `predicted` como vectores de 0/1, puede calcular TP, TN, FP y FN usando tf.count_nonzero:

# Complement masks: for 0/1 inputs, `v - 1` is nonzero (-1) exactly where v is 0.
predicted_negative = predicted - 1
actual_negative = actual - 1

# A product is nonzero only where both factors are nonzero, so counting the
# nonzero entries counts the positions where both conditions hold.
TP = tf.count_nonzero(predicted * actual)                    # predicted 1, actual 1
TN = tf.count_nonzero(predicted_negative * actual_negative)  # predicted 0, actual 0
FP = tf.count_nonzero(predicted * actual_negative)           # predicted 1, actual 0
FN = tf.count_nonzero(predicted_negative * actual)           # predicted 0, actual 1

Ahora sus métricas son fáciles de calcular:

# TP / FP / FN come from tf.count_nonzero and are integer tensors. Under
# Python 2 the `/` operator on integer tensors performs floor division, which
# would collapse precision and recall to 0 or 1; tf.truediv always performs
# floating-point division regardless of the Python version.
precision = tf.truediv(TP, TP + FP)
recall = tf.truediv(TP, TP + FN)
# Harmonic mean of precision and recall.
# NOTE: a zero denominator (no predicted positives, no actual positives, or
# precision + recall == 0) yields NaN rather than raising.
f1 = 2 * precision * recall / (precision + recall)

Quizás este ejemplo le resulte útil:

    # Build the evaluation ops: a prediction is correct when the arg-max of the
    # network output matches the arg-max of the one-hot label.
    pred = multilayer_perceptron(x, weights, biases)
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    with tf.Session() as sess:
        # Variables must be initialized inside the session before any op runs.
        init = tf.initialize_all_variables()
        sess.run(init)

        train_feed = {x: train_arrays, y: train_labels}
        for epoch in range(150):
            # Reset per epoch so the printed value is this epoch's mean cost.
            avg_cost = 0.0
            for i in range(total_batch):
                # NOTE(review): `train_step` (the optimizer's training op) and
                # `cost` (the loss tensor) are assumed to be defined alongside
                # the model; their names are not visible in this snippet -
                # adjust to match your graph.
                sess.run(train_step, feed_dict=train_feed)
                avg_cost += sess.run(cost, feed_dict=train_feed) / total_batch
            if epoch % display_step == 0:
                print("Epoch: %04d cost= %.9f" % (epoch + 1, avg_cost))

        # Evaluate while the session is still open: fetch the accuracy and the
        # predicted class index of every test example in a single run.
        y_p = tf.argmax(pred, 1)
        val_accuracy, y_pred = sess.run(
            [accuracy, y_p], feed_dict={x: test_arrays, y: test_label})

        print("validation accuracy: %s" % val_accuracy)
        # Convert the one-hot test labels to class indices for scikit-learn.
        y_true = np.argmax(test_label, 1)
        print("Precision %s" % sk.metrics.precision_score(y_true, y_pred))
        print("Recall %s" % sk.metrics.recall_score(y_true, y_pred))
        print("f1_score %s" % sk.metrics.f1_score(y_true, y_pred))
        print("confusion_matrix")
        print(sk.metrics.confusion_matrix(y_true, y_pred))
        fpr, tpr, tresholds = sk.metrics.roc_curve(y_true, y_pred)

Caso de etiquetas múltiples

Las respuestas anteriores no especifican cómo manejar el caso de múltiples etiquetas, por lo que aquí hay una versión que implementa tres tipos de puntuación F1 para etiquetas múltiples en TensorFlow: micro, macro y ponderada (según scikit-learn).

Actualización (06/06/18): Escribí una publicación de blog sobre cómo calcular la puntuación F1 multietiqueta en streaming, por si le sirve a alguien (es un proceso más largo y no quiero sobrecargar esta respuesta).

# Multi-label F1 in three flavours; slots are [micro, macro, weighted].
f1s = [0, 0, 0]

# Cast to float64 so the products and the `- 1` complements below are
# computed in floating point.
y_true = tf.cast(y_true, tf.float64)
y_pred = tf.cast(y_pred, tf.float64)

# axis=None counts over every element (one pooled score -> micro, slot 0);
# axis=0 counts along the first axis (one score per column -> macro, slot 1).
for i, axis in enumerate([None, 0]):
    # A product is nonzero only where both factors are nonzero.
    # NOTE(review): presumably y_true / y_pred hold only 0/1 values, so that
    # `v - 1` is nonzero exactly where v is 0 - confirm against the caller.
    TP = tf.count_nonzero(y_pred * y_true, axis=axis)
    FP = tf.count_nonzero(y_pred * (y_true - 1), axis=axis)
    FN = tf.count_nonzero((y_pred - 1) * y_true, axis=axis)

    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1 = 2 * precision * recall / (precision + recall)

    f1s[i] = tf.reduce_mean(f1)

# Support-based weights: column sums of y_true, normalized to sum to 1.
weights = tf.reduce_sum(y_true, axis=0)
weights /= tf.reduce_sum(weights)

# `f1` still holds the value from the LAST loop iteration (axis=0), i.e. the
# per-column scores; this line depends on that iteration order.
f1s[2] = tf.reduce_sum(f1 * weights)

micro, macro, weighted = f1s


def tf_f1_score(y_true, y_pred):
    """Compute the micro, macro and weighted F1 scores of multi-label predictions.

    micro: F1 score computed across all the classes at once, as one score.
    macro: mean of the F1 scores computed per class.
    weighted: weighted average of the F1 scores per class, weighted by the
        support of each class (the column sums of ``y_true``).

    A zero denominator (for example a class without any true positive) is
    not special-cased: the affected score, and every average that includes
    it, evaluates to NaN.

    Args:
        y_true (Tensor): labels, with shape (batch, num_classes).
        y_pred (Tensor): model's predictions, same shape as y_true.
            NOTE(review): the counting below presumably expects both
            tensors to contain only 0/1 values - confirm with callers.

    Returns:
        tuple(Tensor): (micro, macro, weighted), the computed F1 scores.
    """
    f1s = [0, 0, 0]

    # Work in float64 so the products and `- 1` complements are floating point.
    y_true = tf.cast(y_true, tf.float64)
    y_pred = tf.cast(y_pred, tf.float64)

    # axis=None pools the counts over every element (micro, slot 0);
    # axis=0 keeps one count per class (macro, slot 1).
    for i, axis in enumerate([None, 0]):
        # A product is nonzero only where both of its factors are nonzero.
        TP = tf.count_nonzero(y_pred * y_true, axis=axis)
        FP = tf.count_nonzero(y_pred * (y_true - 1), axis=axis)
        FN = tf.count_nonzero((y_pred - 1) * y_true, axis=axis)

        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        f1 = 2 * precision * recall / (precision + recall)

        f1s[i] = tf.reduce_mean(f1)

    # After the loop `f1` holds the result of the last iteration (axis=0),
    # i.e. the per-class scores needed for the weighted average.
    per_class_f1 = f1

    # Class support normalized so that the weights sum to 1.
    weights = tf.reduce_sum(y_true, axis=0)
    weights /= tf.reduce_sum(weights)

    f1s[2] = tf.reduce_sum(per_class_f1 * weights)

    micro, macro, weighted = f1s
    return micro, macro, weighted

def compare(nb, dims):
    """Benchmark scikit-learn's ``f1_score`` against ``tf_f1_score``.

    Generates random binary labels and predictions of shape (nb, dims),
    computes the micro, macro and weighted F1 scores with both
    implementations, and prints the scores together with the elapsed time.

    Args:
        nb (int): number of rows (samples) to generate.
        dims (int): number of columns (labels) per row.
    """
    # Thresholding standard-normal noise at 0.5 gives random 0/1 matrices.
    labels = (np.random.randn(nb, dims) > 0.5).astype(int)
    predictions = (np.random.randn(nb, dims) > 0.5).astype(int)

    stime = time()
    mic = f1_score(labels, predictions, average='micro')
    mac = f1_score(labels, predictions, average='macro')
    wei = f1_score(labels, predictions, average='weighted')

    print('sklearn in {:.4f}:\n    micro: {:.8f}\n    macro: {:.8f}\n    weighted: {:.8f}'.format(
        time() - stime, mic, mac, wei
    ))

    # gtime also covers graph construction and variable initialization;
    # stime (reset below) covers only the evaluation of the three scores.
    gtime = time()
    y_true = tf.Variable(labels)
    y_pred = tf.Variable(predictions)
    micro, macro, weighted = tf_f1_score(y_true, y_pred)
    with tf.Session() as sess:
        # tf.Variable values must be initialized before they can be read.
        sess.run(tf.global_variables_initializer())
        stime = time()
        mic, mac, wei = sess.run([micro, macro, weighted])
        print('tensorflow in {:.4f} ({:.4f} with graph time):\n    micro: {:.8f}\n    macro: {:.8f}\n    weighted: {:.8f}'.format(
            time() - stime, time() - gtime, mic, mac, wei
        ))

# Run the benchmark on 10**6 rows with 10 label columns each.
compare(10 ** 6, 10)


>> rows: 10^6 dimensions: 10
sklearn in 2.3939:
    micro: 0.30890287
    macro: 0.30890275
    weighted: 0.30890279
tensorflow in 0.2465 (3.3246 with graph time):
    micro: 0.30890287
    macro: 0.30890275
    weighted: 0.30890279

