Skip to content

Commit

Permalink
PIG-2514: REGEX_EXTRACT not returning correct group with non greedy r…
Browse files Browse the repository at this point in the history
…egex

git-svn-id: https://svn.apache.org/repos/asf/pig/trunk@1296005 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
Jianyong Dai committed Mar 2, 2012
1 parent b625274 commit 9e80f9e
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 11 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ OPTIMIZATIONS

BUG FIXES

PIG-2514: REGEX_EXTRACT not returning correct group with non greedy regex (romainr via daijy)

PIG-2532: Registered classes fail deserialization in frontend (traviscrawford via julien)

PIG-2549: org.apache.pig.piggybank.storage.avro - Broken documentation link for AvroStorage (chrisas via daijy)
Expand Down
29 changes: 23 additions & 6 deletions src/org/apache/pig/builtin/REGEX_EXTRACT.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,24 @@
* <dd><code>match_index</code>-<code>index of the group to extract</code>.</dd>
* <dt><b>Output:</b></dt>
* <dd><code>extracted group, if fail, return null</code>.</dd>
* <dt><b>Matching strategy:</b></dt>
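* <dd>By default (<code>useMatches=false</code>) the pattern is applied with {@link Matcher#find()},
* so the group is taken from the first subsequence of the input that matches the pattern.
* Pass <code>true</code> to the constructor to require the entire input to match, using
* {@link Matcher#matches()} instead.</dd>
* <dd><code>DEFINE NON_GREEDY_EXTRACT REGEX_EXTRACT(true);</code></dd>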
* </dl>
*/

public class REGEX_EXTRACT extends EvalFunc<String> {
String mExpression = null;
Pattern mPattern = null;
Pattern mPattern = null;
boolean mUseMatches = false;

/**
 * Creates the UDF with the default matching strategy: mUseMatches keeps its
 * initial value of false, so exec applies the pattern with Matcher#find().
 */
public REGEX_EXTRACT() {}

/**
 * Creates the UDF with an explicitly chosen matching strategy.
 *
 * @param useMatches if true, exec applies the pattern with Matcher#matches();
 *                   if false, it applies the pattern with Matcher#find()
 */
public REGEX_EXTRACT(boolean useMatches) {
this.mUseMatches = useMatches;
}

@Override
public Schema outputSchema(Schema input) {
try {
Expand All @@ -56,6 +68,7 @@ public Schema outputSchema(Schema input) {
}
}

@Override
public String exec(Tuple input) throws IOException {
if (input.size()!=3) {
String msg = "RegexExtract : Only 3 parameters are allowed.";
Expand All @@ -81,16 +94,20 @@ public String exec(Tuple input) throws IOException {
throw new IOException(msg);
}
int mIndex = (Integer)input.get(2);

Matcher m = mPattern.matcher((String)input.get(0));
if (m.find()&&m.groupCount()>=mIndex)

if (!mUseMatches&&m.find()||mUseMatches&&m.matches())
{
return m.group(mIndex);
if (m.groupCount()>=mIndex)
{
return m.group(mIndex);
}
}
warn("RegexExtract : Cannot extract group for input "+input.get(0), PigWarning.UDF_WARNING_1);
return null;
}

@Override
public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
List<FuncSpec> funcList = new ArrayList<FuncSpec>();
Expand All @@ -100,5 +117,5 @@ public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
s.add(new Schema.FieldSchema(null, DataType.INTEGER));
funcList.add(new FuncSpec(this.getClass().getName(), s));
return funcList;
}
}
}
20 changes: 15 additions & 5 deletions src/org/apache/pig/builtin/REGEX_EXTRACT_ALL.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,22 @@
* <dd><code>regex</code>-<code>regular expression</code>.</dd>
* <dt><b>Output:</b></dt>
* <dd><code>A tuple of matched strings</code>.</dd>
* <dt><b>Matching strategy:</b></dt>
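* <dd>By default (<code>useMatches=true</code>) the pattern must match the entire input, using
* {@link Matcher#matches()}. Pass <code>false</code> to the constructor to use
* {@link Matcher#find()} instead, so the groups are taken from the first subsequence of the
* input that matches the pattern.</dd>
* <dd><code>DEFINE GREEDY_EXTRACT REGEX_EXTRACT_ALL(false);</code></dd>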
* </dl>
*/

public class REGEX_EXTRACT_ALL extends EvalFunc<Tuple> {

private static TupleFactory tupleFactory = TupleFactory.getInstance();
boolean mUseMatches = true;

/**
 * Creates the UDF with the default matching strategy: mUseMatches keeps its
 * initial value of true, so exec applies the pattern with Matcher#matches().
 */
public REGEX_EXTRACT_ALL() {}

/**
 * Creates the UDF with an explicitly chosen matching strategy.
 *
 * @param useMatches if true, exec applies the pattern with Matcher#matches();
 *                   if false, it applies the pattern with Matcher#find()
 */
public REGEX_EXTRACT_ALL(boolean useMatches) {
this.mUseMatches = useMatches;
}

@Override
public Tuple exec(Tuple input) throws IOException {
Expand All @@ -72,7 +82,7 @@ public Tuple exec(Tuple input) throws IOException {
}

Matcher m = mPattern.matcher((String)input.get(0));
if (!m.matches()) {
if (mUseMatches&&!m.matches()||!mUseMatches&&!m.find()) {
return null;
}
Tuple result = tupleFactory.newTuple(m.groupCount());
Expand All @@ -83,11 +93,11 @@ public Tuple exec(Tuple input) throws IOException {
}

String mExpression = null;
Pattern mPattern = null;
Pattern mPattern = null;
@Override
public Schema outputSchema(Schema input) {
try {
return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),
return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),
DataType.TUPLE));
} catch (Exception e) {
return null;
Expand All @@ -102,6 +112,6 @@ public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
s.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
funcList.add(new FuncSpec(this.getClass().getName(), s));
return funcList;
}
}
}

35 changes: 35 additions & 0 deletions test/org/apache/pig/test/TestBuiltin.java
Original file line number Diff line number Diff line change
Expand Up @@ -1520,6 +1520,11 @@ public void testStringFuncs() throws Exception {
t3.set(0, null);
t3.set(1, "^\\/search\\/iy\\/(.*?)\\/.*");
t3.set(2, 2);

Tuple t4 = tupleFactory.newTuple(3);
t4.set(0,"this is a match");
t4.set(1, "this is a (.+?)");
t4.set(2, 1);

REGEX_EXTRACT func = new REGEX_EXTRACT();
String r = func.exec(t1);
Expand All @@ -1528,6 +1533,12 @@ public void testStringFuncs() throws Exception {
assertTrue(r==null);
r = func.exec(t3);
assertTrue(r==null);
r = func.exec(t4);
assertEquals("m", r);

func = new REGEX_EXTRACT(true);
r = func.exec(t4);
assertEquals("match", r);

String matchRegex = "^(.+)\\b\\s+is a\\s+\\b(.+)$";
TupleFactory tupleFactory = TupleFactory.getInstance();
Expand All @@ -1554,6 +1565,30 @@ public void testStringFuncs() throws Exception {

re = funce.exec(te3);
assertTrue(re==null);

matchRegex = "(.+?)(.+?)";
tupleFactory = TupleFactory.getInstance();
te1 = tupleFactory.newTuple(2);
te1.set(0,"this is a match");
te1.set(1, matchRegex);

funce = new REGEX_EXTRACT_ALL();
re = funce.exec(te1);
assertEquals(re.size(), 2);
assertEquals("t", re.get(0));
assertEquals("his is a match", re.get(1));

funce = new REGEX_EXTRACT_ALL(false);
re = funce.exec(te1);
assertEquals(re.size(), 2);
assertEquals("t", re.get(0));
assertEquals("h", re.get(1));

re = funce.exec(te2);
assertTrue(re==null);

re = funce.exec(te3);
assertTrue(re==null);
}

@Test
Expand Down

0 comments on commit 9e80f9e

Please sign in to comment.