當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

Pascal Sentences数据集预处理

發布時間：2023/12/14 编程问答 36 豆豆

生活随笔收集整理的這篇文章主要介紹了 Pascal Sentences数据集预处理，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

Pascal Sentences^[1] 包含來自 VOC 2008^[2] 的 1000 對圖文對，多數圖有 5 句描述（但不是全部），現參照 [3] 處理。

Data

數據在 [1]，其中 label 是藏在 image 的鏈接路徑中（可右鍵圖片在新窗口打開看網址，或在瀏覽器 F12 打開調試查看元素）。

直接保存網頁，會下載得到 pascal-sentences.htm 文件、pascal-sentences_files/ 目錄。其中，images 裝在目錄里，sentences 嵌在 html 文件中。

原本（在瀏覽器中調試）各 images 對應的 class name 也是嵌在 html 文件里對應的 image 路徑中，但下載的 html 文件里 image 路徑卻變了（簡單指向 pascal-sentences_files/）。為得到 class name，需要在瀏覽器的頁面中查看網頁源碼（右鍵 view page source），然后手動復制一份 html 文件，這里存為 page-source.html。

所有文件都下在 pascal-sentences/ 里。

Sample Order

按 image ID 的升序排列數據
image 文件名形如：2008_000032.jpg，其中 2008 應該是對應 VOC 2008，忽略；000032 就當成 image ID。
寫入 id-map.pascal-sentences.txt

# make.id-map.py
"""Fix the sample order of Pascal Sentences.

Image files are sorted by ascending image ID and written to
`id-map.pascal-sentences.txt` as `<sample id> <image file>`, one per line.
"""
import os
import os.path as osp
import re

P = "G:/dataset/pascal-sentences"
IMAGE_P = osp.join(P, "pascal-sentences_files")

# format: `2008_000032.jpg`; the captured digits are used as the image ID
_IMAGE_NAME = re.compile(r"2008_(\d+)\.jpg")


def id_key(x):
    """Return the integer image ID encoded in the file name *x*.

    Raises:
        ValueError: if *x* does not look like `2008_<digits>.jpg`.
    """
    matched = _IMAGE_NAME.fullmatch(x)
    if matched is None:
        raise ValueError("not a Pascal Sentences image file: {!r}".format(x))
    return int(matched.group(1))


def list_images(file_names):
    """Return the image files among *file_names*, sorted by ascending ID.

    Anything that does not match the image naming scheme is dropped, so
    stray files in the directory cannot break the sort key.
    """
    images = [name for name in file_names if _IMAGE_NAME.fullmatch(name)]
    return sorted(images, key=id_key)


def main():
    """List the image directory and write the id-map file."""
    img_files = list_images(os.listdir(IMAGE_P))
    print(img_files[:10])

    with open(osp.join(P, "id-map.pascal-sentences.txt"), "w") as f:
        f.writelines(
            "{} {}\n".format(sid, img_f) for sid, img_f in enumerate(img_files)
        )


if __name__ == "__main__":
    main()

Class Order and Labels

用 HTMLParser 解析 html 文件，參考 [4,5]
class 順序按 class name 升序排，寫入 class-name.pascal-sentences.txt，為后面 class embedding 準備。
labels 順序同 id-map.pascal-sentences.txt

# make.label.py
"""Build class labels for Pascal Sentences.

The class of each image is the directory part of its `<img src>` path in
`page-source.html`. Classes are ordered by ascending class name and written
to `class-name.pascal-sentences.txt`; labels follow the sample order of
`id-map.pascal-sentences.txt` and are saved to `labels.pascal-sentences.mat`.

run `make.id-map.py` before this file
"""
import os
import os.path as osp
from html.parser import HTMLParser

import numpy as np
import scipy.io as sio

P = "G:/dataset/pascal-sentences"
HTML_F = osp.join(P, "page-source.html")


def split_img_src(src):
    """Split an image path such as `aeroplane/2008_000716.jpg`.

    Returns:
        `(class name, image file name)`, taken from the last two path
        components so that longer paths are accepted as well.

    Raises:
        ValueError: if *src* has no directory component.
    """
    parts = src.split("/")
    if len(parts) < 2 or not parts[-2] or not parts[-1]:
        raise ValueError("cannot find class name in image path: {!r}".format(src))
    return parts[-2], parts[-1]


class ParserLabel(HTMLParser):
    """Collect the class of every `<img>` found in the fed HTML.

    Attributes are filled while feeding:
    `img_cls` maps image file name -> class name,
    `class_set` holds every class name seen.
    """

    def __init__(self):
        super().__init__()
        self.img_cls = {}
        self.class_set = set()

    def handle_starttag(self, tag, attrs):
        """Record the class of an `<img>` tag; other tags are ignored."""
        if tag != "img":
            return
        # look `src` up by name instead of assuming it is the only attribute
        src = dict(attrs).get("src")
        if not src:
            raise ValueError("<img> without src: {!r}".format(attrs))
        _cls, _img_f = split_img_src(src)
        self.class_set.add(_cls)
        self.img_cls[_img_f] = _cls


def class_order(class_names):
    """Return `{class name: class id}` with ids in ascending name order."""
    return {c: cid for cid, c in enumerate(sorted(class_names))}


def read_id_map(lines):
    """Parse `<sample id> <image file>` lines into `{image file: sample id}`.

    Blank lines are skipped.
    """
    img_id = {}
    for line in lines:
        line = line.strip()
        if line:
            sid, img_f = line.split(None, 1)
            img_id[img_f] = int(sid)
    return img_id


def build_labels(img_cls, cls_id, img_id):
    """Return an int32 vector whose entry `sid` is the class id of sample `sid`.

    Raises:
        ValueError: if the number of labelled images differs from the
            number of samples in the id-map.
    """
    if len(img_cls) != len(img_id):
        raise ValueError(
            "#labelled images ({}) != #samples ({})".format(len(img_cls), len(img_id))
        )
    labels = np.zeros([len(img_cls)], dtype=np.int32)
    for img, c in img_cls.items():
        labels[img_id[img]] = cls_id[c]
    return labels


def main():
    """Parse the page source, then write the class list and the label file."""
    # only ASCII paths are needed from the page, so undecodable bytes
    # elsewhere are replaced rather than aborting the run
    with open(HTML_F, "r", encoding="utf-8", errors="replace") as f:
        html_txt = f.read()

    print("parse annotations")
    parser = ParserLabel()
    parser.feed(html_txt)

    print("class order")
    cls_id = class_order(parser.class_set)
    with open(osp.join(P, "class-name.pascal-sentences.txt"), "w") as f:
        for c in sorted(cls_id, key=cls_id.get):
            f.write("{} {}\n".format(cls_id[c], c))

    print("read sample order")
    with open(osp.join(P, "id-map.pascal-sentences.txt"), "r") as f:
        img_id = read_id_map(f)

    print("label")
    labels = build_labels(parser.img_cls, cls_id, img_id)
    print("#data:", labels.shape[0])  # 1000
    # (1000,) 0 19 9.5 9500
    print("labels:", labels.shape, labels.min(), labels.max(), labels.mean(), labels.sum())
    sio.savemat(
        osp.join(P, "labels.pascal-sentences.mat"),
        {"labels": labels},
        do_compression=True,
    )


if __name__ == "__main__":
    main()

Class Embeddings

class name 的 Word2Vec feature，順序同 class-name.pascal-sentences.txt。
用到 GoogleNews-vectors-negative300.bin，下載見 [6,7]。
替換了一些該 Word2Vec 模型沒有的詞(組)，見下面代碼；詞組拆成多個詞取平均。

# make.w2v.py
"""class Word2Vec embeddings

One embedding per class, in the order of `class-name.pascal-sentences.txt`.
Class names made of several words joined by `_` that are absent from the
model are embedded as the mean of their word vectors.

run `make.label.py` first
"""
import os
import os.path as osp

import numpy as np
import scipy.io as sio
# from gensim.models import Word2Vec

P = "/home/dataset/pascal-sentences"
CLASS_NAME_F = osp.join(P, "class-name.pascal-sentences.txt")

# word2vec
MODEL = "/home/dataset/word2vec/GoogleNews-vectors-negative300.bin"

IN_DOCKER = True
USER_ID = 1000

# class names absent from the Word2Vec model -> the replacement that is used
RENAME = {
    "aeroplane": "airplane",
    "diningtable": "dining_table",
    "pottedplant": "potted_plant",
    "tvmonitor": "TV_monitor",
}


def read_class_names(lines):
    """Parse `<class id> <class name>` lines into a list of class names.

    Blank lines are skipped and names listed in `RENAME` are replaced.
    The list keeps the order of *lines*.
    """
    cls_set = []
    for line in lines:
        line = line.strip()
        if line:
            _cid, cn = line.split()
            cls_set.append(RENAME.get(cn, cn))
    return cls_set


def find_absent(cls_set, w2v):
    """Return the class names of *cls_set* that are not keys of *w2v*."""
    return [c for c in cls_set if c not in w2v]


def embed_class(c, w2v):
    """Return the embedding of class name *c*.

    Uses `w2v[c]` when present; otherwise *c* is split on `_` and the word
    vectors are averaged.

    Raises:
        KeyError: if *c* is absent and is a single word, or if one of its
            words is absent from *w2v*.
    """
    if c in w2v:
        return w2v[c]
    words = c.split("_")
    if len(words) < 2:
        raise KeyError("absent single word: {}".format(c))
    return sum(w2v[w] for w in words) / len(words)


def chown_to_host_user(path):
    """Give *path* to `USER_ID` when running inside the docker container.

    Best effort: a failure is reported but does not abort the script.
    """
    if not IN_DOCKER:
        return
    try:
        os.chown(path, USER_ID, USER_ID)
    except OSError as err:
        print("WARNING: cannot chown {}: {}".format(path, err))


def main():
    """Load the model, report absent class names, save the class embeddings."""
    # imported here so the helpers above stay importable without gensim
    from gensim.models.keyedvectors import KeyedVectors

    with open(CLASS_NAME_F, "r") as f:
        cls_set = read_class_names(f)
    print("classes:", len(cls_set), cls_set)

    w2v = KeyedVectors.load_word2vec_format(MODEL, binary=True)

    print("find out the absent class names")
    _file_name = osp.join(P, "absent-class-name.txt")
    with open(_file_name, "w") as f:
        for c in find_absent(cls_set, w2v):
            print(c)
            f.write("{}\n".format(c))
    chown_to_host_user(_file_name)

    print("class embedding")
    class_emb = np.vstack([embed_class(c, w2v) for c in cls_set]).astype(np.float32)
    # (20, 300) -0.62109375 0.62890625 -0.010865917 -65.1955
    print("class emb:", class_emb.shape, class_emb.min(), class_emb.max(),
          class_emb.mean(), class_emb.sum())

    _file_name = osp.join(P, "class_emb.pascal-sentences.Gnews-300d.mat")
    sio.savemat(_file_name, {"class_emb": class_emb})
    # the script runs in a docker container, so hand the file over to the
    # host user for convenience (guarded by IN_DOCKER like the file above)
    chown_to_host_user(_file_name)


if __name__ == "__main__":
    main()

Images

images 的 VGG 19 feature
順序同 id-map.pascal-sentences.txt

# make.image.py
"""image VGG 19 features

Features are the output of the VGG19 classifier with its last two layers
removed, computed on images resized to 224x224, stored in the sample order
of `id-map.pascal-sentences.txt`.

run `make.id-map.py` before this file
"""
import os
import os.path as osp

import numpy as np
import scipy.io as sio
# import h5py

P = "/home/dataset/pascal-sentences"
IMAGE_P = osp.join(P, "pascal-sentences_files")

BATCH_SIZE = 100

IN_DOCKER = True
USER_ID = 1000


def parse_id_map(lines):
    """Parse `<sample id> <image file>` lines into `{sample id: image file}`.

    Blank lines are skipped.
    """
    id_img = {}
    for line in lines:
        line = line.strip()
        if line:
            sid, img_f = line.split(None, 1)
            id_img[int(sid)] = img_f
    return id_img


def batch_ranges(n, batch_size):
    """Yield `(begin, end)` index pairs covering `range(n)` in batches."""
    for begin in range(0, n, batch_size):
        yield begin, min(begin + batch_size, n)


def load_rgb_image(img_p):
    """Read the image at *img_p* as an RGB `uint8` array of shape (H, W, 3).

    OpenCV is tried first; files it cannot decode are read with PIL and
    converted to RGB, which also covers grayscale, palette and RGBA images.
    """
    # imported here so the pure helpers stay importable without OpenCV/PIL
    import cv2
    from PIL import Image

    img = cv2.imread(img_p)
    if img is None:
        with Image.open(img_p) as img_f:
            return np.array(img_f.convert("RGB"))
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)


def chown_to_host_user(path):
    """Give *path* to `USER_ID` when running inside the docker container.

    Best effort: a failure is reported but does not abort the script.
    """
    if not IN_DOCKER:
        return
    try:
        os.chown(path, USER_ID, USER_ID)
    except OSError as err:
        print("WARNING: cannot chown {}: {}".format(path, err))


def main():
    """Extract the features of every image and save them to a .mat file."""
    import cv2
    import torch
    import torchvision.models as models
    import torchvision.transforms as transforms
    from PIL import Image

    print("read sample order")
    with open(osp.join(P, "id-map.pascal-sentences.txt"), "r") as f:
        id_img = parse_id_map(f)
    N = len(id_img)
    print("#image:", N)
    if N == 0:
        raise RuntimeError("id-map.pascal-sentences.txt lists no image")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = models.vgg19(pretrained=True)
    model.classifier = model.classifier[:-2]
    model = model.to(device)
    # the truncated classifier still contains a Dropout layer; without
    # eval() it stays active and the extracted features become random
    model.eval()

    trsf = transforms.Compose([
        # transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    fea_list = []
    with torch.no_grad():
        for begin, end in batch_ranges(N, BATCH_SIZE):
            image_batch = []
            for sid in range(begin, end):
                img = load_rgb_image(osp.join(IMAGE_P, id_img[sid]))
                img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_LINEAR)
                image_batch.append(trsf(Image.fromarray(img)))
            image_batch = torch.stack(image_batch).to(device)
            fea_list.append(model(image_batch).cpu().numpy())
            print(begin)

    Fea = np.vstack(fea_list).astype(np.float32)
    print("image features:", Fea.shape)
    _f_name = osp.join(P, "images.pascal-sentences.vgg19.{}d.mat".format(Fea.shape[1]))
    sio.savemat(_f_name, {"images": Fea}, do_compression=True)
    chown_to_host_user(_f_name)


if __name__ == "__main__":
    main()

Texts

（2022.7.11 Notes）[8] 中，文本的預處理方法已改為 Stanford CoreNLP 分詞 + lowercase，但本文的還是用 gensim.utils.simple_preprocess 的舊方法，代碼未更新（flag：以后有空更），請酌情參考 [8] 的新方法重制。
參照 [8]，5 句話拼在一起，提取 Doc2Vec features。
本節(jié)所用環(huán)境同 [8] 的對應一節(jié)：[11] 的容器創(chuàng)建 python 2.7 的虛擬環(huán)境 + [9] 的預訓練 Doc2Vec 模型 + [10] 的舊版 gensim。
python 2 的 HTMLParser 包名不同 python 3，見 [12] 和下面代碼。

# make.text.py
# text Doc2Vec feature: the sentences of each image are concatenated and
# embedded with a pre-trained Doc2Vec model, in the sample order of
# `id-map.pascal-sentences.txt`.
# run `make.id-map.py` before this file
from __future__ import print_function

import os
import os.path as osp

try:
    from HTMLParser import HTMLParser  # python 2
except ImportError:
    from html.parser import HTMLParser  # python 3

import numpy as np
import scipy.io as sio

P = "/home/dataset/pascal-sentences"
HTML_F = osp.join(P, "page-source.html")

# doc2vec
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
DIM = 300  # dimension of the doc2vec feature

IN_DOCKER = True
USER_ID = 1000


def split_img_src(src):
    """Split an image path such as `aeroplane/2008_000716.jpg`.

    Returns:
        `(class name, image file name)`, taken from the last two path
        components.

    Raises:
        ValueError: if *src* has no directory component.
    """
    parts = src.split("/")
    if len(parts) < 2 or not parts[-2] or not parts[-1]:
        raise ValueError("cannot find class name in image path: {!r}".format(src))
    return parts[-2], parts[-1]


class ParserText(HTMLParser):
    """Collect the sentences that belong to every image of the fed HTML.

    An outer `<tr>` holds one `<img>` and nested `<tr>` rows whose `<td>`
    text is gathered. When the outer row closes, the gathered text is
    concatenated and stored in `img_txt` under the image file name.
    """

    def __init__(self):
        # explicit base call: works on python 2 as well as python 3
        HTMLParser.__init__(self)
        self.tr_layer = 0        # nesting depth of the currently open <tr> tags
        self.current_img = None  # image file of the outer row being parsed
        self.current_txt = []    # text pieces gathered for that image
        self.current_tag = None  # most recently opened tag
        self.img_txt = {}        # image file name -> concatenated sentences

    def handle_starttag(self, tag, attrs):
        """Track the open tag, the `<tr>` depth and the image of the row."""
        self.current_tag = tag
        if "img" == tag:
            # look `src` up by name instead of assuming it is the only attribute
            src = dict(attrs).get("src")
            if not src:
                raise ValueError("<img> without src: {!r}".format(attrs))
            _cls, _img_f = split_img_src(src)
            self.current_img = _img_f
        elif "tr" == tag:
            self.tr_layer += 1

    def handle_endtag(self, tag):
        """Store the gathered text once the outer `<tr>` is closed."""
        if "tr" != tag:
            return
        self.tr_layer -= 1
        if 0 != self.tr_layer:
            return
        # rows without an image carry no sample and are not stored
        if self.current_img is not None:
            self.img_txt[self.current_img] = "".join(self.current_txt)
        self.current_img = None
        self.current_txt = []
        self.current_tag = None

    def handle_data(self, data):
        """Gather the text of `<td>` cells that sit in a nested `<tr>`."""
        if (2 == self.tr_layer) and ("td" == self.current_tag):
            self.current_txt.append(data)


def read_id_map(lines):
    """Parse `<sample id> <image file>` lines into `{image file: sample id}`.

    Blank lines are skipped.
    """
    img_id = {}
    for line in lines:
        line = line.strip()
        if line:
            sid, img_f = line.split(None, 1)
            img_id[img_f] = int(sid)
    return img_id


def chown_to_host_user(path):
    """Give *path* to `USER_ID` when running inside the docker container.

    Best effort: a failure is reported but does not abort the script.
    """
    if not IN_DOCKER:
        return
    try:
        os.chown(path, USER_ID, USER_ID)
    except OSError as err:
        print("WARNING: cannot chown {}: {}".format(path, err))


def main():
    """Parse the sentences, infer their Doc2Vec features and save them."""
    # imported here so the parser above stays importable without gensim
    import gensim
    from gensim.models import Doc2Vec

    with open(HTML_F, "r") as f:
        html_txt = f.read()

    print("parse sentences")
    parser = ParserText()
    parser.feed(html_txt)

    print("read sample order")
    with open(osp.join(P, "id-map.pascal-sentences.txt"), "r") as f:
        img_id = read_id_map(f)

    print("text")
    if len(parser.img_txt) != len(img_id):
        raise ValueError("#texts ({}) != #samples ({})".format(
            len(parser.img_txt), len(img_id)))
    N_DATA = len(parser.img_txt)
    print("#data:", N_DATA)  # 1000
    texts = np.zeros([N_DATA, DIM], dtype=np.float32)

    model = Doc2Vec.load(MODEL)
    for img, txt in parser.img_txt.items():
        sid = img_id[img]
        # tokenize the text of this image (was an undefined name before)
        doc = gensim.utils.simple_preprocess(txt)
        texts[sid] = model.infer_vector(doc)

    # (1000, 300) -0.6507467 0.6664893 -0.0071584913 -2147.5474
    print("texts:", texts.shape, texts.min(), texts.max(), texts.mean(), texts.sum())

    _f_name = osp.join(P, "texts.pascal-sentences.doc2vec.{}.mat".format(DIM))
    sio.savemat(_f_name, {"texts": texts})
    chown_to_host_user(_f_name)


if __name__ == "__main__":
    main()

Cloud Drive

百度網盤：https://pan.baidu.com/s/1QfyhxPLjPfQS5JdHWh4HTQ，提取碼：lwbd。

References

[1] Pascal Sentences

[2] Visual Object Classes Challenge 2008 (VOC2008)

[3] TCSVT 2020 | Zero-Shot Cross-Media Embedding Learning With Dual Adversarial Distribution Network

[4] html.parser — 簡單的 HTML 和 XHTML 解析器 <- python 3

[5] Python HTML操作（HTMLParser）

[6] GoogleNews-vectors-negative300.bin.gz

[7] nishankmahore/word2vec-flask-api

[8] MS COCO 2017數據集預處理

[9] jhlau/doc2vec

[10] jhlau/gensim

[11] pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime

[12] HTMLParser — Simple HTML and XHTML parser <- python 2

總結

以上是生活随笔為你收集整理的Pascal Sentences数据集预处理的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： STM32 汇编程序——串口输出 Hel
下一篇：单片机 c语言 p1控制流水灯,单片机控