Skip to content

Commit 8f7c9df

Browse files
committed
working codes
1 parent f8237b3 commit 8f7c9df

File tree

3 files changed

+129
-1
lines changed

3 files changed

+129
-1
lines changed

README.md

+44
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,46 @@
11
# java-word-embedding
2+
23
Word embedding in Java
4+
5+
This project provides GloVe word embeddings that developers can use directly within their own projects.
6+
7+
# Usage
8+
9+
The [sample code](src/main/java/com/github/chen0040/embeddings/GloVeModelDemo.java) below shows how to use
10+
[GloVeModel](src/main/java/com/github/chen0040/embeddings/GloVeModel.java) to create GloVe word embeddings of different
11+
dimensions (e.g., 50, 100, 200, 300)
12+
13+
```java
14+
15+
import org.slf4j.Logger;
16+
import org.slf4j.LoggerFactory;
17+
import com.github.chen0040.embeddings.GloVeModel;
18+
19+
public class GloVeModelDemo {
20+
21+
private static final Logger logger = LoggerFactory.getLogger(GloVeModelDemo.class);
22+
23+
public static void main(String[] args) {
24+
GloVeModel model = new GloVeModel();
25+
model.load100();
26+
27+
logger.info("word2em size: {}", model.size());
28+
logger.info("word2em dimension for individual word: {}", model.getWordVecDimension());
29+
30+
logger.info("father: {}", model.encodeWord("father"));
31+
logger.info("mother: {}", model.encodeWord("mother"));
32+
logger.info("man: {}", model.encodeWord("man"));
33+
logger.info("woman: {}", model.encodeWord("woman"));
34+
logger.info("boy: {}", model.encodeWord("boy"));
35+
logger.info("girl: {}", model.encodeWord("girl"));
36+
37+
logger.info("distance between boy and girl: {}", model.distance("boy", "girl"));
38+
39+
40+
String doc = "The Zen of Python. Beautiful is better than ugly. Explicit is better than implicit. Simple is better than complex. Complex is better than complicated. Flat is better than nested. Sparse is better than dense. Readability counts. Special cases aren't special enough to break the rules.";
41+
42+
logger.info("doc: {}", model.encodeDocument(doc));
43+
44+
45+
}
}
46+
```

src/main/java/com/github/chen0040/embeddings/GloVeModel.java

+60
Original file line numberDiff line numberDiff line change
@@ -55,21 +55,59 @@ public float[] encodeWord(String word) {
5555
return null;
5656
}
5757

58+
public float[] encodeDocument(String sentence) {
59+
sentence = filter(sentence);
60+
String[] words = sentence.split(" ");
61+
62+
float[] vec = new float[dimension];
63+
for(String word: words) {
64+
String w = word.trim();
65+
if(w.equals("")){
66+
continue;
67+
}
68+
float[] word2vec = encodeWord(w);
69+
if(word2vec == null) continue;
70+
for(int i=0; i < dimension; ++i){
71+
vec[i] += word2vec[i];
72+
}
73+
}
74+
75+
return vec;
76+
77+
}
78+
79+
private String filter(String sent) {
80+
sent = sent.toLowerCase();
81+
String[] punctuations = new String[] {",", ".", ";", ":", "?", "!", "\"", "'"};
82+
for(String punt : punctuations) {
83+
sent = sent.replace(punt, " " + punt);
84+
}
85+
return sent;
86+
}
87+
5888
public int size() {
5989
return word2em.size();
6090
}
6191

92+
public int getWordVecDimension() {
93+
return dimension;
94+
}
95+
6296
public Map<String, float[]> load(String dirPath, int dimension){
6397
this.dimension = -1;
6498
word2em.clear();
6599
String sourceFile100 = getGloVeTextFileName(dimension);
66100
String filePath = dirPath + "/" + sourceFile100;
67101
File file = new File(filePath);
68102
if(!file.exists()){
103+
69104
String zipFilePath = dirPath + "/glove.6B.zip";
70105
if(!new File(zipFilePath).exists()) {
106+
logger.info("{} not found on local machine, downloading it from {}", zipFilePath, url);
71107
if (!HttpClient.downloadFile(url, zipFilePath)) {
72108
return word2em;
109+
} else {
110+
logger.info("{} is downloaded", zipFilePath);
73111
}
74112
}
75113

@@ -78,6 +116,8 @@ public Map<String, float[]> load(String dirPath, int dimension){
78116
}
79117
}
80118

119+
logger.info("loading {} into word2em", filePath);
120+
81121
try(BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filePath))))){
82122
String line;
83123
while((line=reader.readLine()) != null) {
@@ -103,6 +143,7 @@ public Map<String, float[]> load(String dirPath, int dimension){
103143
}
104144

105145
private boolean unZip(String zipFilePath, String dirPath) {
146+
logger.info("unzipping {} to {}", zipFilePath, dirPath);
106147
try {
107148
ZipFile zipFile = new ZipFile(zipFilePath);
108149
zipFile.extractAll(dirPath);
@@ -113,4 +154,23 @@ private boolean unZip(String zipFilePath, String dirPath) {
113154
return false;
114155
}
115156
}
157+
158+
public double distance(String word1, String word2) {
159+
float[] vec1 = encodeWord(word1);
160+
float[] vec2 = encodeWord(word2);
161+
162+
if(vec1 == null || vec2 == null) {
163+
return -1f;
164+
}
165+
166+
float result = 0;
167+
for(int i=0; i < dimension; ++i) {
168+
float v1 = vec1[i];
169+
float v2 = vec2[i];
170+
171+
result += (v1 - v2) * (v1 - v2);
172+
}
173+
174+
return Math.sqrt(result);
175+
}
116176
}
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,34 @@
11
package com.github.chen0040.embeddings;
22

3+
import org.slf4j.Logger;
4+
import org.slf4j.LoggerFactory;
5+
36
public class GloVeModelDemo {
7+
8+
private static final Logger logger = LoggerFactory.getLogger(GloVeModelDemo.class);
9+
410
public static void main(String[] args) {
511
GloVeModel model = new GloVeModel();
612
model.load100();
7-
System.out.println("word2em size: " + model.size());
13+
14+
logger.info("word2em size: {}", model.size());
15+
logger.info("word2em dimension for individual word: {}", model.getWordVecDimension());
16+
17+
logger.info("father: {}", model.encodeWord("father"));
18+
logger.info("mother: {}", model.encodeWord("mother"));
19+
logger.info("man: {}", model.encodeWord("man"));
20+
logger.info("woman: {}", model.encodeWord("woman"));
21+
logger.info("boy: {}", model.encodeWord("boy"));
22+
logger.info("girl: {}", model.encodeWord("girl"));
23+
24+
logger.info("distance between boy and girl: {}", model.distance("boy", "girl"));
25+
26+
27+
String doc = "The Zen of Python. Beautiful is better than ugly. Explicit is better than implicit. Simple is better than complex. Complex is better than complicated. Flat is better than nested. Sparse is better than dense. Readability counts. Special cases aren't special enough to break the rules.";
28+
29+
logger.info("doc: {}", model.encodeDocument(doc));
30+
31+
832
}
933

1034
}

0 commit comments

Comments
 (0)