Created
January 8, 2019 21:08
-
-
Save cristhianleonli/01c2961e4141a66279421fedd37e7699 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Foundation | |
class EntitiesFinder {

    /// One space-separated word of the searched text.
    private struct Token {
        /// The word exactly as it appears in the (lowercased) text.
        let word: String
        /// `word` without its last character when that character is not a
        /// lowercase ASCII letter (e.g. "york," -> "york"); otherwise `word`.
        let core: String
        /// Offset of the first character of `word`, counted in Characters.
        let offset: Int

        init(word: String, offset: Int) {
            self.word = word
            self.offset = offset
            let lowercaseLetters = UInt8(ascii: "a")...UInt8(ascii: "z")
            if let ascii = word.last?.asciiValue, lowercaseLetters.contains(ascii) {
                core = word
            } else {
                core = String(word.dropLast())
            }
        }
    }

    /// Splits `text` on single spaces, keeping the real offset of every word so
    /// that runs of consecutive spaces do not shift the reported locations.
    private static func tokenize(_ text: String) -> [Token] {
        var tokens = [Token]()
        var offset = 0
        for piece in text.split(separator: " ", omittingEmptySubsequences: false) {
            if !piece.isEmpty {
                tokens.append(Token(word: String(piece), offset: offset))
            }
            // +1 accounts for the separating space that `split` consumed.
            offset += piece.count + 1
        }
        return tokens
    }

    /**
     Finds every term of `array` that occurs in `string`, either as a single
     word or as a run of consecutive words.

     Matching is done on the lowercased text, so the terms in `array` are
     expected to be lowercase. A single trailing character that is not a
     lowercase ASCII letter (a comma, a period, ...) is ignored on the last word
     of a match and is not included in the returned range. When several terms
     start at the same word, the longest one wins; matches never overlap.

     Only exact terms are reported: a run of words that is merely the beginning
     of a longer term (e.g. "new york" when only "new york city" is a term) is
     not returned.

     - Parameter array: set of terms to look for in the string
     - Parameter string: target text in which to find the terms
     - Returns: one range per found term, in order of appearance,
       Range => (location, length). Offsets are counted in Characters of the
       lowercased text. NOTE(review): callers that feed these ranges to
       UTF-16 based APIs should confirm the text has no multi-unit characters.
     */
    class func findEntitiesRanges(for array: Set<String>, in string: String) -> [NSRange] {
        guard !array.isEmpty else { return [] }

        let tokens = tokenize(string.lowercased())
        var ranges = [NSRange]()
        var start = 0

        while start < tokens.count {
            // Words start..<cursor joined by single spaces, punctuation kept.
            var joined = ""
            // Longest exact match that begins at `start`, if any.
            var longest: (last: Int, end: Int)?

            for cursor in start..<tokens.count {
                let token = tokens[cursor]
                let candidate = cursor == start ? token.core : "\(joined) \(token.core)"

                // Every longer candidate starts with this one, so once no term
                // has it as a prefix there is nothing left to find from `start`.
                guard array.contains(where: { $0.starts(with: candidate) }) else { break }

                if !token.core.isEmpty && array.contains(candidate) {
                    longest = (last: cursor, end: token.offset + token.core.count)
                }
                joined = cursor == start ? token.word : "\(joined) \(token.word)"
            }

            if let match = longest {
                let location = tokens[start].offset
                ranges.append(NSRange(location: location, length: match.end - location))
                start = match.last + 1
            } else {
                start += 1
            }
        }

        return ranges
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Foundation | |
/// A single word of a text together with the bookkeeping values that the code
/// building the node attaches to it.
struct EntityNode {

    /// The word this node was created from (see `init(word:)`).
    var content = ""

    /// Start offset associated with the node; assigned by the caller.
    var startIndex = 0

    /// Flag assigned by the caller; `false` by default.
    var isCompound = false

    /// Flag assigned by the caller; `false` by default.
    var isExact = false

    /// Accumulated text associated with the node; assigned by the caller.
    var allContent = ""

    /// `true` when the last character of `content` is anything other than a
    /// lowercase ASCII letter (`a`...`z`) — e.g. a comma, a period or a digit.
    /// An empty `content` has no stop character.
    var stopCharacter: Bool {
        guard let last = content.last else { return false }
        // Non-ASCII characters can never be in a...z.
        guard let ascii = last.asciiValue else { return true }
        return !(UInt8(ascii: "a")...UInt8(ascii: "z")).contains(ascii)
    }

    /// `content` without its last character; empty when `content` is empty.
    var withoutLast: String {
        return String(content.dropLast())
    }

    init() {
    }

    init(word: String) {
        self.content = word
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment