Since we can’t use any of the classes of java.util.regex, the solution below relies only on the regex methods of String. In our editor the parsing is done continuously while the user types, which means that toMarkup may already contain an anchor element produced by an earlier pass. Therefore, we must not replace that URL again or, more generally, we must not parse any URL that is already contained in an anchor tag. The last part of PATTERN accounts for that situation.
/**
 * Converts URLs into short form of url elements.
 *
 * The shorter form of the URL will contain the subdomain, domain and top domain. Protocol (http(s) or
 * ftp) and path will be removed.
 *
 * <p>Only the regex-capable methods of {@link String} ({@code split}, {@code matches},
 * {@code replaceFirst}) are used; the classes of {@code java.util.regex} are deliberately not
 * referenced.
 *
 * <p>NOTE(review): the published listing had lost its HTML-like text (everything between a
 * {@code <} and the next {@code >} was stripped), which left the pattern with unbalanced groups and
 * the generated link without any anchor markup. Both are reconstructed here from the surviving
 * fragments; confirm against the original source if it is available (e.g. extra anchor attributes).
 *
 * @author zubi
 */
public class LinkParser implements RichTextHTMLParser {

    /** Protocol prefix of a URL; must start at a word boundary. */
    private static final String BEFORE_DOMAIN = "\\b((https?|ftp)://)";

    /**
     * Optional port followed by any number of path segments. Each segment starts with '/', and '/'
     * is not part of the segment body, so a given path can be matched in exactly one way. This
     * accepts the same strings as a body class that contains '/', but avoids exponential
     * backtracking when the trailing lookahead rejects a candidate.
     */
    private static final String PATH = "(:\\d+)?(/[-a-z0-9A-Z_:@&?=+,.!~*'%#$]*)*";

    /**
     * Negative lookahead: the match must not be followed by a closing anchor tag unless an opening
     * anchor tag comes first. This rejects URLs that already sit inside an anchor element, both in
     * its href attribute and in its text.
     */
    private static final String NOT_INSIDE_ANCHOR = "(?!((?!(?:<a )).)*?(?:</a>))";

    /** Optional protocol, dotted host labels, top level domain, optional port and path. */
    private static final String PATTERN = BEFORE_DOMAIN
            + "?([a-z0-9](?:[-a-z0-9A-Z]*[a-z0-9])?\\.)+"
            + "(com\\b|edu\\b|biz\\b|gov\\b|in(?:t|fo)\\b|mil\\b|net\\b|org\\b|[a-z][a-z]\\b)"
            + PATH + NOT_INSIDE_ANCHOR;

    /**
     * Replaces every URL in the given text that is not already part of an anchor element with an
     * anchor whose text is the shortened URL.
     *
     * @param toMarkup the text to process; may already contain anchors from an earlier pass
     * @return the text with the remaining plain URLs converted to anchors; {@code null} or empty
     *         input is returned unchanged
     */
    @Override
    public String getHTML(String toMarkup) {
        if (toMarkup == null || toMarkup.isEmpty()) {
            return toMarkup;
        }

        StringBuilder html = new StringBuilder();
        String remaining = toMarkup;
        while (!remaining.isEmpty()) {
            // Splitting at the first match yields the text before and after the URL; the URL itself
            // is whatever lies between the two pieces.
            String[] parts = remaining.split(PATTERN, 2);
            String before = parts.length > 0 ? parts[0] : "";
            String after = parts.length > 1 ? parts[1] : "";
            int urlStart = before.length();
            int urlEnd = remaining.length() - after.length();
            if (urlStart >= urlEnd) {
                // Nothing was cut out of the text, i.e. there is no further URL.
                break;
            }

            // Plain concatenation instead of replaceFirst: a URL may contain '$', which would be
            // interpreted as a group reference inside a replacement string.
            html.append(before).append(getLink(remaining.substring(urlStart, urlEnd)));

            // Continue with the unprocessed tail only. The text emitted so far ends in a closing
            // anchor tag, so the tail sees the same word-boundary context as a full rescan would.
            remaining = after;
        }
        return html.append(remaining).toString();
    }

    /**
     * Builds the anchor element for a URL, in the form {@code <a href="URL">domain</a>}.
     *
     * @param url the URL as found in the text, with or without protocol
     * @return the anchor markup; URLs without protocol get {@code http://} prepended in the href
     */
    private String getLink(String url) {
        String href = url.matches(BEFORE_DOMAIN + ".*") ? url : "http://" + url;
        // The characters PATTERN accepts exclude '"', '<' and '>', so the URL cannot break out of
        // the attribute value.
        return "<a href=\"" + href + "\">" + getDomain(href) + "</a>";
    }

    /**
     * Reduces a URL to its host name (subdomain, domain and top level domain).
     *
     * @param url a URL as matched by {@code PATTERN}, with or without protocol
     * @return the URL without protocol, port and path
     */
    private String getDomain(String url) {
        String withoutProtocol = url.replaceFirst("^" + BEFORE_DOMAIN, "");
        // A host name contains neither ':' nor '/', so the first of them starts the port or path.
        return withoutProtocol.replaceFirst("[:/].*$", "");
    }
}
No comments:
Post a Comment