Skip to content

Commit 950d9a4

Browse files
committed
Issue #59: country and location bias
1 parent ac828db commit 950d9a4

File tree

5 files changed

+151
-47
lines changed

5 files changed

+151
-47
lines changed

Core/src/main/java/org/opensextant/processing/Parameters.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
package org.opensextant.processing;
1818

1919
import java.util.Date;
20+
import java.util.HashMap;
2021
import java.util.HashSet;
22+
import java.util.List;
2123
import java.util.Set;
2224

2325
import org.joda.time.format.DateTimeFormat;
@@ -81,6 +83,15 @@ public class Parameters extends java.util.Properties {
8183
public String outputFile = null;
8284

8385
private Set<String> formats = new HashSet<String>();
86+
87+
/**
88+
* Relays caller-supplied geographic preferences to an extraction routine. These cues are used as
89+
* tie-breakers when disambiguating otherwise ambiguous place mentions.
90+
*
91+
* "countries" = [c1, c2, c3, ...]
92+
* "geohashes" = [g1, g2, g3, ...]
93+
*/
94+
public HashMap<String, List<String>> preferredGeography = new HashMap<>();
8495

8596
/** You the caller must explicitly set isdefault = false;
8697
* forcing you to actually look at these parameters.

src/main/java/org/opensextant/extractors/geo/PlaceGeocoder.java

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,16 @@ private void reset() {
441441
private boolean geocode = true;
442442
private boolean tagOnly = !geocode;
443443

444+
445+
/**
446+
* See {@link #extract(TextInput, Parameters)} below.
447+
* This is the default extraction routine. If you need to tune extraction call <code>extract( input, parameters ) </code>
448+
*/
449+
@Override
450+
public List<TextMatch> extract(TextInput input) throws ExtractionException {
451+
return extract(input, null);
452+
}
453+
444454
/**
445455
* Extractor.extract() calls first XCoord to get coordinates, then PlacenameMatcher In the end you
446456
* have all geo entities ranked and scored.
@@ -462,10 +472,13 @@ private void reset() {
462472
* @return TextMatch instances which are all PlaceCandidates.
463473
* @throws ExtractionException on err
464474
*/
465-
@Override
466-
public List<TextMatch> extract(TextInput input) throws ExtractionException {
475+
public List<TextMatch> extract(TextInput input, Parameters jobParams) throws ExtractionException {
467476
long t1 = System.currentTimeMillis();
468477
reset();
478+
479+
if (jobParams != null) {
480+
this.setAllowLowerCase(jobParams.tag_lowercase);
481+
}
469482

470483
List<TextMatch> matches = new ArrayList<TextMatch>();
471484
List<TextMatch> coordinates = null;
@@ -522,7 +535,7 @@ public List<TextMatch> extract(TextInput input) throws ExtractionException {
522535
// Last rule: score, choose, add confidence.
523536
//
524537
chooser.setTextCase(input.isLower ? GeocodeRule.LOWERCASE : 0);
525-
chooser.evaluate(candidates);
538+
chooser.evaluate(candidates, jobParams);
526539
if (provinceNameSetter != null) {
527540
provinceNameSetter.evaluate(candidates);
528541
}

src/main/java/org/opensextant/extractors/geo/rules/LocationChooserRule.java

Lines changed: 68 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package org.opensextant.extractors.geo.rules;
22

33
import java.util.HashMap;
4+
import java.util.HashSet;
45
import java.util.List;
56
import java.util.Map;
67

@@ -11,6 +12,7 @@
1112
import org.opensextant.extractors.geo.PlaceCount;
1213
import org.opensextant.extractors.geo.PlaceEvidence;
1314
import org.opensextant.extractors.geo.PlaceGeocoder;
15+
import org.opensextant.processing.Parameters;
1416
import org.opensextant.util.GeodeticUtility;
1517

1618
/**
@@ -34,6 +36,8 @@ public class LocationChooserRule extends GeocodeRule {
3436
private Map<String, PlaceCount> boundaryContext = null;
3537
private Map<String, PlaceCount> namespace = new HashMap<>();
3638
private HashMap<String, CountryCount> inferredCountries = new HashMap<>();
39+
private HashSet<String> preferredCountries = new HashSet<>();
40+
private HashSet<String> preferredLocations = new HashSet<>();
3741

3842
private int textCase = 0;
3943

@@ -53,12 +57,19 @@ public void reset() {
5357
documentCandidates.clear();
5458
namespace.clear();
5559
inferredCountries.clear();
60+
preferredCountries.clear();
61+
preferredLocations.clear();
62+
}
63+
64+
@Override
65+
public void evaluate(List<PlaceCandidate> names) {
66+
evaluate(names, (Parameters) null);
5667
}
5768

5869
/**
5970
* Walk the entire list.
6071
*/
61-
public void evaluate(List<PlaceCandidate> names) {
72+
public void evaluate(List<PlaceCandidate> names, Parameters preferences) {
6273

6374
// INPUTS:
6475
// histogram of country mentions
@@ -71,6 +82,16 @@ public void evaluate(List<PlaceCandidate> names) {
7182
//
7283
countryContext = countryObserver.countryMentionCount();
7384
boundaryContext = boundaryObserver.placeMentionCount();
85+
//
86+
// PREFS:
87+
if (preferences != null) {
88+
if (preferences.preferredGeography.containsKey("countries")) {
89+
preferredCountries.addAll(preferences.preferredGeography.get("countries"));
90+
}
91+
if (preferences.preferredGeography.containsKey("geohashes")) {
92+
preferredLocations.addAll(preferences.preferredGeography.get("geohashes"));
93+
}
94+
}
7495

7596
/* TODO: DEBUG through location chooser using histograms
7697
* of found and resolved place metadata.
@@ -190,6 +211,13 @@ private void debuggingHistograms(List<PlaceCandidate> names) {
190211
*/
191212
private static final int GLOBAL_POINTS = 5;
192213

214+
/**
215+
* Preferred Country or Location -- when user supplies the context that may be missing.... We accept
216+
* that and weight such preference higher.
217+
*/
218+
public static String PREF_COUNTRY = "PreferredCountry";
219+
public static String PREF_LOCATION = "PreferredLocation";
220+
193221
/**
194222
* Yet unchosen location. Consider given evidence first, creating some weight there, then
195223
* introducing innate properties of possible locations, thereby amplifying the differences in the
@@ -199,7 +227,30 @@ private void debuggingHistograms(List<PlaceCandidate> names) {
199227
@Override
200228
public void evaluate(PlaceCandidate name, Place geo) {
201229

230+
// With "preferred geography" we can influence in a subtle fashion ambiguous mentions, e.g.,
231+
// If known geography is Ohio and we see mentions of Springfield without other context, we can
232+
// nudge choice of Springfield, OH as such. Such as with a preferred location (geohash).
233+
234+
if (preferredCountries != null && !preferredCountries.isEmpty()) {
235+
if (preferredCountries.contains(geo.getCountryCode())) {
236+
// Get a half-point for being within the country
237+
name.incrementPlaceScore(geo, 0.5);
238+
name.addRule(PREF_COUNTRY);
239+
}
240+
}
241+
if (preferredLocations != null && !preferredLocations.isEmpty()) {
242+
for (String gh : preferredLocations) {
243+
if (geo.getGeohash().startsWith(gh)) {
244+
// Increment a full point for being within the geohash. Note geohash length of 4 or more chars is reasonably good resolution.
245+
name.incrementPlaceScore(geo, 1.0);
246+
name.addRule(PREF_LOCATION);
247+
}
248+
}
249+
}
250+
202251
if (boundaryContext.isEmpty() && countryContext.isEmpty()) {
252+
// So without context, there is nothing more we can do to influence the connection between
253+
// the one named place and the candidate location
203254
return;
204255
}
205256

@@ -275,8 +326,8 @@ public void evaluate(PlaceCandidate name, Place geo) {
275326
public static final int MATCHCONF_NAME_REGION = 75;
276327

277328
/**
278-
* Absolute Confidence: Unique name in gazetteer.
279-
* Confidence is high, however this needs to be tempered by the number of gazetteers, coverage, and diversity
329+
* Absolute Confidence: Unique name in gazetteer. Confidence is high, however this needs to be
330+
* tempered by the number of gazetteers, coverage, and diversity
280331
*/
281332
public static final int MATCHCONF_ONE_LOC = 70;
282333

@@ -309,6 +360,12 @@ public void evaluate(PlaceCandidate name, Place geo) {
309360
*/
310361
public static final int MATCHCONF_QUALIFIER_LOWERCASE = -15;
311362

363+
/**
364+
* A subtle boost for locations that were preferred -- especially helps when there is no inherent
365+
* context and we must rely on the caller's intuition.
366+
*/
367+
public static final int MATCHCONF_PREFERRED = 5;
368+
312369
private static boolean isShort(int matchLen) {
313370
return matchLen <= NonsenseFilter.GENERIC_ONE_WORD;
314371
}
@@ -380,7 +437,7 @@ public void assessConfidence(PlaceCandidate pc) {
380437
if (fc != null) {
381438
featWeight = fc.factor;
382439
}
383-
points = (int)((0.75 * points) + (0.25 * points * featWeight));
440+
points = (int) ((0.75 * points) + (0.25 * points * featWeight));
384441

385442
// Any of these may occur.
386443
//======================
@@ -457,6 +514,13 @@ public void assessConfidence(PlaceCandidate pc) {
457514
points += pc.getLength() - 4;
458515
}
459516

517+
if (pc.hasRule(PREF_COUNTRY)) {
518+
points += MATCHCONF_PREFERRED;
519+
}
520+
if (pc.hasRule(PREF_LOCATION)) {
521+
points += MATCHCONF_PREFERRED;
522+
}
523+
460524
pc.setConfidence(points);
461525
}
462526

src/main/java/org/opensextant/xlayer/server/TaggerResource.java

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,15 @@
22

33
import static org.apache.commons.lang3.StringUtils.isNotBlank;
44

5+
import java.util.ArrayList;
56
import java.util.HashSet;
7+
import java.util.Iterator;
8+
import java.util.List;
69
import java.util.Set;
710
import java.util.logging.Level;
811
import java.util.logging.Logger;
912

13+
import org.json.JSONArray;
1014
import org.json.JSONException;
1115
import org.json.JSONObject;
1216
import org.opensextant.data.TextInput;
@@ -138,6 +142,20 @@ protected void resetParameters(Parameters job) {
138142
job.tag_patterns = false;
139143
job.addOutputFormat("json");
140144
}
145+
146+
/**
147+
*
148+
* @param a JSONArray
149+
* @return
150+
*/
151+
protected List<String> fromArray(JSONArray a){
152+
ArrayList<String> strings = new ArrayList<>();
153+
Iterator<Object> iter = a.iterator();
154+
while (iter.hasNext()) {
155+
strings.add((String)iter.next());
156+
}
157+
return strings;
158+
}
141159

142160
/**
143161
*
@@ -181,7 +199,14 @@ protected Parameters fromRequest(JSONObject inputs) throws JSONException {
181199
job.tag_lowercase = opts.contains("lowercase");
182200
job.resolve_localities = opts.contains("revgeo") || opts.contains("resolve_localities");
183201
}
184-
202+
//
203+
// Geographic filters
204+
if (inputs.has("preferred_countries")) {
205+
job.preferredGeography.put("countries", fromArray(inputs.getJSONArray("preferred_countries")));
206+
}
207+
if (inputs.has("preferred_locations")) {
208+
job.preferredGeography.put("geohashes", fromArray(inputs.getJSONArray("preferred_locations")));
209+
}
185210
if (job.clean_input || job.tag_lowercase) {
186211
job.isdefault = false;
187212
}

src/main/java/org/opensextant/xlayer/server/xgeo/XponentsGeotagger.java

Lines changed: 30 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public XponentsGeotagger() {
3636
}
3737

3838
/**
39-
* get Xponents Exxtractor object from global attributes.
39+
* Get the Xponents Extractor object from global attributes.
4040
*/
4141
public Extractor getExtractor(String xid) {
4242
Object X = this.getApplication().getContext().getAttributes().get(xid);
@@ -60,27 +60,22 @@ public Extractor getExtractor(String xid) {
6060
}
6161

6262
/**
63-
* Contract:
64-
* docid optional; 'text' | 'doc-list' required.
65-
* command: cmd=ping sends back a simple response
66-
*
67-
* text = UTF-8 encoded text
68-
* docid = user's provided document ID
69-
* doc-list = An array of text
70-
*
71-
* cmd=ping = report status.
72-
*
73-
* Where json-array contains { docs=[ {docid='A', text='...'}, {docid='B', text='...',...] }
74-
* The entire array must be parsable in memory as a single, traversible JSON object.
75-
* We make no assumption about one-JSON object per line or anything about line-endings as separators.
76-
*
77-
*
78-
* @param params
79-
* the params
80-
* @return the representation
81-
* @throws JSONException
82-
* the JSON exception
83-
*/
63+
* Contract: docid optional; 'text' | 'doc-list' required. command: cmd=ping sends back a simple
64+
* response
65+
*
66+
* text = UTF-8 encoded text docid = user's provided document ID doc-list = An array of text
67+
*
68+
* cmd=ping = report status.
69+
*
70+
* Where json-array contains { docs=[ {docid='A', text='...'}, {docid='B', text='...'}, ... ] }. The
71+
* entire array must be parsable in memory as a single, traversable JSON object. We make no
72+
* assumption about one JSON object per line or anything about line endings as separators.
73+
*
74+
*
75+
* @param params JSON parameters per REST API: docid, text, lang, features, options, and preferred_*
76+
* @return the representation
77+
* @throws JSONException the JSON exception
78+
*/
8479
@Post("application/json;charset=utf-8")
8580
public Representation processForm(JsonRepresentation params) throws JSONException {
8681
org.json.JSONObject json = params.getJsonObject();
@@ -100,16 +95,14 @@ public Representation processForm(JsonRepresentation params) throws JSONExceptio
10095
}
10196

10297
/**
103-
* HTTP GET -- vanilla. Do not use in production, unless you have really small data packages.
104-
* This is useful for testing. Partial contract:
105-
*
106-
* miscellany: 'cmd' = 'ping' |... other commands.
107-
* processing: 'docid' = ?, 'text' = ?
108-
*
109-
* @param params
110-
* the params
111-
* @return the representation
112-
*/
98+
* HTTP GET -- vanilla. Do not use in production, unless you have really small data packages. This
99+
* is useful for testing. Partial contract:
100+
*
101+
* miscellany: 'cmd' = 'ping' |... other commands. processing: 'docid' = ?, 'text' = ?
102+
*
103+
* @param params JSON parameters. see process()
104+
* @return the representation
105+
*/
113106
@Get
114107
public Representation processGet(Representation params) {
115108
Form inputs = getRequest().getResourceRef().getQueryAsForm();
@@ -140,10 +133,8 @@ public Representation process(TextInput input, Parameters jobParams) {
140133
try {
141134
if (prodMode) {
142135
PlaceGeocoder xgeo = (PlaceGeocoder) getExtractor("xgeo");
143-
xgeo.setAllowLowerCase(jobParams.tag_lowercase);
136+
List<TextMatch> matches = xgeo.extract(input, jobParams);
144137

145-
List<TextMatch> matches = xgeo.extract(input);
146-
147138
if (jobParams.tag_patterns) {
148139
XTemporal xt = (XTemporal) getExtractor("xtemp");
149140
matches.addAll(xt.extract(input));
@@ -169,7 +160,7 @@ public Representation process(TextInput input, Parameters jobParams) {
169160
/**
170161
* Format matches as JSON
171162
*
172-
* @param matches items to format
163+
* @param matches items to format
173164
* @param jobParams parameters
174165
* @return formatted json
175166
* @throws JSONException on format error
@@ -184,9 +175,9 @@ private Representation format(List<TextMatch> matches, Parameters jobParams) thr
184175
}
185176

186177
/**
187-
* @param params parameters
188-
* @param variousMatches matches to filter
189-
*/
178+
* @param params parameters
179+
* @param variousMatches matches to filter
180+
*/
190181
public void filter(List<TextMatch> variousMatches, Parameters params) {
191182
// Determine what looks useful. Filter out things not worth
192183
// saving at all in data store.

0 commit comments

Comments
 (0)