Tensorflow实现将标签变为one-hot形式

Ursula ·

更新时间:2024-11-15

· 635 次阅读

将数据标签变为类似MNIST的one-hot编码形式


def one_hot(indices, 
 depth, 
 on_value=None, 
 off_value=None, 
 axis=None, 
 dtype=None, 
 name=None):
 """Returns a one-hot tensor.
 The locations represented by indices in `indices` take value 
 `on_value`,
 while all other locations take value `off_value`.
 `on_value` and `off_value` must have matching data types. If 
 `dtype` is also
 provided, they must be the same data type as specified by 
 `dtype`.
 If `on_value` is not provided, it will default to the value `1` with 
 type
 `dtype`
 If `off_value` is not provided, it will default to the value `0` with 
 type
 `dtype`
 If the input `indices` is rank `N`, the output will have rank 
 `N+1`. The
 new axis is created at dimension `axis` (default: the new axis is 
 appended
 at the end).
 If `indices` is a scalar the output shape will be a vector of 
 length `depth`
 If `indices` is a vector of length `features`, the output shape will 
 be:
 ```
 features x depth if axis == -1
 depth x features if axis == 0
 ```
 If `indices` is a matrix (batch) with shape `[batch, features]`, the 
 output
 shape will be:
 ```
 batch x features x depth if axis == -1
 batch x depth x features if axis == 1
 depth x batch x features if axis == 0
 ```
 If `dtype` is not provided, it will attempt to assume the data 
 type of
 `on_value` or `off_value`, if one or both are passed in. If none 
 of
 `on_value`, `off_value`, or `dtype` are provided, `dtype` will 
 default to the
 value `tf.float32`.
 Note: If a non-numeric data type output is desired (`tf.string`, 
 `tf.bool`,
 etc.), both `on_value` and `off_value` _must_ be provided to 
 `one_hot`.
 For example:
 ```python
 indices = [0, 1, 2]
 depth = 3
 tf.one_hot(indices, depth) # output: [3 x 3]
 # [[1., 0., 0.],
 # [0., 1., 0.],
 # [0., 0., 1.]]
 indices = [0, 2, -1, 1]
 depth = 3
 tf.one_hot(indices, depth,
 on_value=5.0, off_value=0.0,
 axis=-1) # output: [4 x 3]
 # [[5.0, 0.0, 0.0], # one_hot(0)
 # [0.0, 0.0, 5.0], # one_hot(2)
 # [0.0, 0.0, 0.0], # one_hot(-1)
 # [0.0, 5.0, 0.0]] # one_hot(1)
 indices = [[0, 2], [1, -1]]
 depth = 3
 tf.one_hot(indices, depth,
 on_value=1.0, off_value=0.0,
 axis=-1) # output: [2 x 2 x 3]
 # [[[1.0, 0.0, 0.0], # one_hot(0)
 # [0.0, 0.0, 1.0]], # one_hot(2)
 # [[0.0, 1.0, 0.0], # one_hot(1)
 # [0.0, 0.0, 0.0]]] # one_hot(-1)
 ```
 Args:
 indices: A `Tensor` of indices.
 depth: A scalar defining the depth of the one hot dimension.
 on_value: A scalar defining the value to fill in output when 
 `indices[j]
 = i`. (default: 1)
 off_value: A scalar defining the value to fill in output when 
 `indices[j]
 != i`. (default: 0)
 axis: The axis to fill (default: -1, a new inner-most axis).
 dtype: The data type of the output tensor.
 Returns:
 output: The one-hot tensor.
 Raises:
 TypeError: If dtype of either `on_value` or `off_value` don't 
 match `dtype`
 TypeError: If dtype of `on_value` and `off_value` don't match 
 one another
 """
 with ops.name_scope(name, "one_hot", 
 [indices, depth, on_value, off_value, axis, 
  dtype]) as name:
 on_exists = on_value is not None
 off_exists = off_value is not None
 on_dtype = ops.convert_to_tensor(on_value).dtype.base_dtype 
  if on_exists else None
 off_dtype = ops.convert_to_tensor(off_value).dtype.
  base_dtype if off_exists else None
 if on_exists or off_exists:
  if dtype is not None:
  # Ensure provided on_value and/or off_value match dtype
  if (on_exists and on_dtype != dtype):
   raise TypeError("dtype {0} of on_value does not match "
   "dtype parameter {1}".format(on_dtype, dtype))
  if (off_exists and off_dtype != dtype):
   raise TypeError("dtype {0} of off_value does not match "
   "dtype parameter {1}".format(off_dtype, dtype))
  else:
  # dtype not provided: automatically assign it
  dtype = on_dtype if on_exists else off_dtype
 elif dtype is None:
  # None of on_value, off_value, or dtype provided. Default 
  dtype to float32
  dtype = dtypes.float32
 if not on_exists:
  # on_value not provided: assign to value 1 of type dtype
  on_value = ops.convert_to_tensor(1, dtype, name="
  on_value")
  on_dtype = dtype
 if not off_exists:
  # off_value not provided: assign to value 0 of type dtype
  off_value = ops.convert_to_tensor(0, dtype, name="
  off_value")
  off_dtype = dtype
 if on_dtype != off_dtype:
  raise TypeError("dtype {0} of on_value does not match "
  "dtype {1} of off_value".format(on_dtype, off_dtype))
 return gen_array_ops._one_hot(indices, depth, on_value, 
  off_value, axis, 
  name)
Enter: apply completion.
 + Ctrl: remove arguments and replace current word (no Pop-
 up focus).
 + Shift: remove arguments (requires Pop-up focus).


import tensorflow as tf
import numpy as np
data = np.linspace(0,9,10)
label = tf.one_hot(data,10)
with tf.Session() as sess:
 print(data)
 print(sess.run(label))

补充知识：数据清洗—制作one-hot

使用pandas进行one-hot编码

pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)

pandas中get_dummies()函数可以将字段进行编码，转换为01形式，其中prefix可以为每个新展开的列名添加前缀。

但是，笔者发现它较易使用在数据为每一列为单独的字符：


df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]})
## one-hot
df_dumm = pd.get_dummies(df)

my_one_hot

但是对于数据为下面形式的可就不能直接转换了，需要先预处理一下，之后转换为one-hot形式：

我的做法是:


## tqdm_notebook可以导入tqdm包来使用
def one_hot_my(dataframe, attri):
 sample_attri_list = []
 sample_attri_loc_dic = {}
 loc = 0
 dataframe[attri] = dataframe[attri].astype(str)
 for attri_id in tqdm_notebook(dataframe[attri]):
  attri_id_pro = attri_id.strip().split(',')
  for key in attri_id_pro:
   if key not in sample_attri_loc_dic.keys():
    sample_attri_loc_dic[key] = loc
    loc+=1
  sample_attri_list.append(attri_id_pro)
 print("开始完成one-hot.......")  
 one_hot_attri = []
 for attri_id in tqdm_notebook(sample_attri_list):
  array = [0 for _ in range(len(sample_attri_loc_dic.keys()))]
  for key in attri_id:
   array[sample_attri_loc_dic[key]] = 1
  one_hot_attri.append(array)
 print("封装成dataframe.......") 
 ## 封装成dataframe
 columns = [attri+x for x in sample_attri_loc_dic.keys()]
 one_hot_rig_id_df = pd.DataFrame(one_hot_attri,columns=columns)
 return one_hot_rig_id_df

对属性二值化可以采用：


## 对属性进行二值化
def binary_apply(key, attri, dataframe):
 key_modify = 'is_' + ''.join(lazy_pinyin(key)) + '_' + attri
 print(key_modify)
 dataframe[key_modify] = dataframe.apply(lambda x:1 if x[attri]== key else 0, axis=1)
 return dataframe

对字符进行编码,将字符转换为0，1，2…：


## 对字符进行编码
# columns = ['job', 'marital', 'education','default','housing' ,'loan','contact', 'poutcome']
def encode_info(dataframe, columns):
 for col in columns:
  print(col)
  dataframe[col] = pd.factorize(dataframe[col])[0]
 return dataframe