直接保存網頁,會下載得到 pascal-sentences.htm 文件、pascal-sentences_files/ 目錄。其中,images 裝在目錄里,sentences 嵌在 html 文件中。
原本(在瀏覽器中調試)各 images 對應的 class name 也是嵌在 html 文件里對應的 image 路徑中,但下載的 html 文件里 image 路徑卻變了(簡單指向 pascal-sentences_files/)。為得到 class name,需要在瀏覽器的頁面中查看網頁源碼(右鍵 view page source),然后手動復制一份 html 文件,這里存為 page-source.html。
class name 的 Word2Vec feature,順序同 class-name.pascal-sentences.txt。
用到 GoogleNews-vectors-negative300.bin,下載見 [6,7]。
替換了一些該 Word2Vec 模型沒有的詞(組),見下面代碼;詞組拆成多個詞取平均。
# make.w2v.py
import os
import os.path as osp

import numpy as np
import scipy.io as sio
# from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

"""Class-name Word2Vec embeddings for pascal-sentences.

Reads the class names listed in `class-name.pascal-sentences.txt`, looks each
one up in the pretrained GoogleNews Word2Vec model and saves the stacked
vectors (one row per class, in file order) to a `.mat` file under key
`class_emb`.

Run `make.label.py` first (presumably it produces the class-name file read
below -- confirm against that script).
"""

P = "/home/dataset/pascal-sentences"
CLASS_NAME_F = osp.join(P, "class-name.pascal-sentences.txt")
# word2vec
MODEL = "/home/dataset/word2vec/GoogleNews-vectors-negative300.bin"
IN_DOCKER = True
USER_ID = 1000

# Class names that are absent from the Word2Vec vocabulary, mapped to the
# spelling used instead. Names containing "_" that are still absent are split
# on "_" and their word vectors averaged further down.
#   aeroplane   -> airplane
#   diningtable -> dining_table
#   pottedplant -> potted_plant
#   tvmonitor   -> TV_monitor
_RENAME = {
    "aeroplane": "airplane",
    "diningtable": "dining_table",
    "pottedplant": "potted_plant",
    "tvmonitor": "TV_monitor",
}

# Each non-empty line is expected to hold exactly two whitespace-separated
# fields: a class id and a class name. Only the name is used here.
cls_set = []
with open(CLASS_NAME_F, "r") as f:
    for line in f:
        line = line.strip()
        if line:
            cid, cn = line.split()
            cls_set.append(_RENAME.get(cn, cn))
print("classes:", len(cls_set), cls_set)

w2v = KeyedVectors.load_word2vec_format(MODEL, binary=True)

# Record which (possibly renamed) class names the model does not contain.
print("find out the absent class names")
_file_name = osp.join(P, "absent-class-name.txt")
with open(_file_name, "w") as f:
    for c in cls_set:
        if c not in w2v:
            print(c)
            f.write("{}\n".format(c))
if IN_DOCKER:
    os.system("chown {0}:{0} {1}".format(USER_ID, _file_name))

print("class embedding")
class_emb = []
for c in cls_set:
    if c in w2v:
        class_emb.append(w2v[c])
        continue
    # Explicit raise instead of `assert`: asserts are stripped under `-O`,
    # which would hide the problem behind a bare KeyError from the lookup.
    if "_" not in c:
        raise ValueError("absent single word: {}".format(c))
    # Phrase not in the vocabulary: average the vectors of its words.
    c_list = c.split("_")
    class_emb.append(sum(w2v[_c] for _c in c_list) / len(c_list))
class_emb = np.vstack(class_emb).astype(np.float32)
# values observed by the original author:
# (20, 300) -0.62109375 0.62890625 -0.010865917 -65.1955
print("class emb:", class_emb.shape, class_emb.min(), class_emb.max(),
      class_emb.mean(), class_emb.sum())

_file_name = osp.join(P, "class_emb.pascal-sentences.Gnews-300d.mat")
sio.savemat(_file_name, {"class_emb": class_emb})
# because I run this script in a docker container,
# I shall change the ownership & group of this file for convenience
# (guarded by IN_DOCKER, consistent with the other chown calls)
if IN_DOCKER:
    os.system("chown {0}:{0} {1}".format(USER_ID, _file_name))
Images
images 的 VGG 19 feature
順序同 id-map.pascal-sentences.txt
# make.image.py
# Extract VGG-19 features for every pascal-sentences image, in the sample
# order given by id-map.pascal-sentences.txt, and save them to a .mat file
# under key "images".
import os
import os.path as osp

import numpy as np
import scipy.io as sio
# import h5py
import cv2
from PIL import Image
import torch
import torchvision.models as models
import torchvision.transforms as transforms

P = "/home/dataset/pascal-sentences"
IMAGE_P = osp.join(P, "pascal-sentences_files")
BATCH_SIZE = 100
IN_DOCKER = True
USER_ID = 1000

# Each non-empty line is expected to hold two whitespace-separated fields:
# an integer sample id and an image file name relative to IMAGE_P.
print("read sample order")
id_img = {}
with open(osp.join(P, "id-map.pascal-sentences.txt"), "r") as f:
    for line in f:
        line = line.strip()
        if line:
            sid, img_f = line.split()
            id_img[int(sid)] = img_f
N = len(id_img)
print("#image:", N)

model = models.vgg19(pretrained=True)
# Drop the last two layers of the classifier head so the network outputs an
# intermediate activation instead of class scores.
model.classifier = model.classifier[:-2]
model = model.cuda()
# Switch to inference mode: in torchvision's VGG the truncated head still
# contains a Dropout layer, and torch.no_grad() does NOT disable dropout.
# Without eval() the extracted features are randomly zeroed/rescaled and
# differ from run to run.
model.eval()

trsf = transforms.Compose([
    # transforms.Resize(224),  # resizing is done with cv2 below instead
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

fea_list = []
with torch.no_grad():
    for i in range(0, N, BATCH_SIZE):
        image_batch = []
        # NOTE(review): indexing id_img by range() assumes the sample ids are
        # exactly 0..N-1 -- confirm against the id-map file.
        for sid in range(i, min(i + BATCH_SIZE, N)):
            img_p = osp.join(IMAGE_P, id_img[sid])
            img = cv2.imread(img_p)
            if img is None:
                # cv2 could not decode the file; fall back to PIL.
                # convert("RGB") yields an (H, W, 3) array for grayscale,
                # palette and RGBA inputs alike, so palette indices are not
                # mistaken for intensities and an alpha channel cannot break
                # the 3-channel Normalize above.
                with Image.open(img_p) as img_f:
                    img = np.asarray(img_f.convert("RGB"))
            else:
                # cv2 decodes to BGR; the normalisation constants are RGB.
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_LINEAR)
            img = trsf(Image.fromarray(img))
            image_batch.append(img.unsqueeze(0))
        image_batch = torch.cat(image_batch, 0)
        image_batch = image_batch.cuda()
        fea = model(image_batch).cpu().numpy()
        fea_list.append(fea)
        print(i)  # progress: index of the first sample in this batch

Fea = np.vstack(fea_list).astype(np.float32)
print("image features:", Fea.shape)

# The feature dimensionality is embedded in the output file name.
_f_name = osp.join(P, "images.pascal-sentences.vgg19.{}d.mat".format(Fea.shape[1]))
sio.savemat(_f_name, {"images": Fea}, do_compression=True)
if IN_DOCKER:
    os.system("chown {0}:{0} {1}".format(USER_ID, _f_name))