From 36c7d17d3b69fe30a8e2e836d58f02169a02c03f Mon Sep 17 00:00:00 2001
From: Alanscut
Date: Wed, 28 Oct 2020 11:59:25 +0800
Subject: [PATCH] add IPRecognition

---
Reviewer note: the commit id above and the blob ids on the "index" lines are
carried over from the original submission; they do not describe the revised
content below.

 .../recognition/impl/IDCardRecognition.java   | 30 +++++++++++--
 .../ansj/recognition/impl/IPRecognition.java  | 42 +++++++++++++++++++
 .../impl/IDCardRecognitionTest.java           |  2 +
 .../recognition/impl/IPRecognitionTest.java   | 21 ++++++++++
 4 files changed, 92 insertions(+), 3 deletions(-)
 create mode 100644 src/main/java/org/ansj/recognition/impl/IPRecognition.java
 create mode 100644 src/test/java/org/ansj/recognition/impl/IPRecognitionTest.java

diff --git a/src/main/java/org/ansj/recognition/impl/IDCardRecognition.java b/src/main/java/org/ansj/recognition/impl/IDCardRecognition.java
index 4fa933ff..f2360f58 100644
--- a/src/main/java/org/ansj/recognition/impl/IDCardRecognition.java
+++ b/src/main/java/org/ansj/recognition/impl/IDCardRecognition.java
@@ -20,23 +20,47 @@ public class IDCardRecognition implements Recognition {
 	 */
 	private static final long serialVersionUID = -32133440735240290L;
 	private static final Nature ID_CARD_NATURE = new Nature("idcard");
+
+	/**
+	 * An 18-character ID number: six digits, a year in 1800-3999, a month 01-12,
+	 * a day 01-31, three digits, then a final digit or X/x. Compiled once so the
+	 * loop below does not recompile it for every candidate term.
+	 */
+	private static final java.util.regex.Pattern ID_NO_18 = java.util.regex.Pattern.compile(
+			"\\d{6}(18|19|([23]\\d))\\d{2}((0[1-9])|(10|11|12))(([0-2][1-9])|10|20|30|31)\\d{3}[0-9Xx]");
 
 	@Override
 	public void recognition(Result result) {
 
 		List<Term> terms = result.getTerms() ;
-		
+
 		for (Term term : terms) {
 			if ("m".equals(term.getNatureStr())) {
 
-				if (term.getName().length() == 18) {
+				if (term.getName().length() == 18 && ID_NO_18.matcher(term.getName()).matches()) {
 					term.setNature(ID_CARD_NATURE);
 				} else if (term.getName().length() == 17) {
 					Term to = term.to();
-					if ("x".equals(to.getName())) {
+					// The 18th character may have been split off into the next term.
+					// Read that term's name once, null-safely, so a missing neighbour or
+					// one whose name was already cleared is skipped instead of throwing.
+					String next = to == null ? null : to.getName();
+					if ("x".equals(next) && ID_NO_18.matcher(term.getName() + next).matches()) {
 						term.merage(to);
 						to.setName(null);
 						term.setNature(ID_CARD_NATURE);
+					} else if (next != null && (next.startsWith("x") || next.startsWith("X"))) {
+						// The next term only begins with the final character: move that
+						// one character over and leave the remainder in the next term.
+						String start = next.substring(0, 1);
+						if (ID_NO_18.matcher(term.getName() + start).matches()) {
+							String nextRealName = to.getRealName();
+							term.setRealName(term.getRealName() + nextRealName.substring(0, 1));
+							term.setName(term.getName() + start);
+							to.setRealName(nextRealName.substring(1));
+							to.setName(next.substring(1));
+							term.setNature(ID_CARD_NATURE);
+						}
 					}
 				}
 
diff --git a/src/main/java/org/ansj/recognition/impl/IPRecognition.java b/src/main/java/org/ansj/recognition/impl/IPRecognition.java
new file mode 100644
index 00000000..c233858b
--- /dev/null
+++ b/src/main/java/org/ansj/recognition/impl/IPRecognition.java
@@ -0,0 +1,42 @@
+package org.ansj.recognition.impl;
+
+import org.ansj.app.extracting.Extracting;
+import org.ansj.app.extracting.exception.RuleFormatException;
+import org.ansj.domain.TermNature;
+import org.ansj.domain.TermNatures;
+
+/**
+ * Recognition step that hands one fixed IPv4 extracting rule and the "ip" term
+ * nature to {@link ExtractingRecognition}.
+ */
+public class IPRecognition extends ExtractingRecognition {
+
+	private static final long serialVersionUID = 1L;
+
+	private static final TermNatures IP_T_N = new TermNatures(new TermNature("ip", 1));
+
+	private static final Extracting EXTRACTING = new Extracting();
+
+	/** Rule fragment for one address segment; the same fragment is used for all four. */
+	private static final String OCTET = "(:m)[25[0-5]|][2[0-4]\\\\d|][1\\\\d{2}|][[0-9]\\\\d][\\\\d]";
+
+	/** Rule fragment for the dot between two segments. */
+	private static final String DOT = "(\\.)";
+
+	private static final String IPV4_RULE = OCTET + DOT + OCTET + DOT + OCTET + DOT + OCTET;
+
+	static {
+		try {
+			EXTRACTING.addRuleStr(IPV4_RULE);
+		} catch (RuleFormatException e) {
+			// The rule is a compile-time constant, so a parse failure is a programming
+			// error: fail class initialisation instead of leaving a recognizer that
+			// silently has no rule.
+			throw new ExceptionInInitializerError(e);
+		}
+	}
+
+	public IPRecognition() {
+		super(EXTRACTING, IP_T_N);
+	}
+}
diff --git a/src/test/java/org/ansj/recognition/impl/IDCardRecognitionTest.java b/src/test/java/org/ansj/recognition/impl/IDCardRecognitionTest.java
index abbf0775..09bd4ff5 100644
--- a/src/test/java/org/ansj/recognition/impl/IDCardRecognitionTest.java
+++ b/src/test/java/org/ansj/recognition/impl/IDCardRecognitionTest.java
@@ -15,8 +15,10 @@ public class IDCardRecognitionTest {
 	@Test
 	public void test() {
 		Result result = ToAnalysis.parse("我吃了一个西瓜,我今年25岁。13282619771220503X这里有一万个东西,我的身份证号码是130722198506280057h");
+		Result result2 = ToAnalysis.parse("132826197713205030这,身份证号码是13072219850628005xx");
 
 		System.out.println(result.recognition(new IDCardRecognition()));
+		System.out.println(result2.recognition(new IDCardRecognition()));
 	}
 
 }
diff --git a/src/test/java/org/ansj/recognition/impl/IPRecognitionTest.java b/src/test/java/org/ansj/recognition/impl/IPRecognitionTest.java
new file mode 100644
index 00000000..69cb801f
--- /dev/null
+++ b/src/test/java/org/ansj/recognition/impl/IPRecognitionTest.java
@@ -0,0 +1,21 @@
+package org.ansj.recognition.impl;
+
+import org.ansj.domain.Result;
+import org.ansj.splitWord.analysis.ToAnalysis;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class IPRecognitionTest {
+
+	@Test
+	public void recognition() throws Exception {
+		Result recognition = ToAnalysis.parse("192.168.1.1, 1.1.1.1, 255.254.251.256, 0.0.0.0")
+				.recognition(new IPRecognition());
+
+		System.out.println(recognition);
+		// assertEquals takes the expected value first, so a failure message reports
+		// "expected" and "actual" the right way round.
+		Assert.assertEquals("192.168.1.1", recognition.get(0).getName());
+	}
+
+}