| // Copyright (C) 2016 The Android Open Source Project |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package com.google.gerrit.mail; |
| |
| import com.google.common.base.Strings; |
| import com.google.common.collect.ImmutableSet; |
| import com.google.common.collect.Iterators; |
| import com.google.common.collect.PeekingIterator; |
| import com.google.gerrit.entities.Comment; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.List; |
| import org.jsoup.Jsoup; |
| import org.jsoup.nodes.Document; |
| import org.jsoup.nodes.Element; |
| |
| /** Provides functionality for parsing the HTML part of a {@link MailMessage}. */ |
| public class HtmlParser { |
| |
| private static final ImmutableSet<String> MAIL_PROVIDER_EXTRAS = |
| ImmutableSet.of( |
| "gmail_extra", // "On 01/01/2017 User<user@gmail.com> wrote:" |
| "gmail_quote" // Used for quoting original content |
| ); |
| |
| private static final ImmutableSet<String> WHITELISTED_HTML_TAGS = |
| ImmutableSet.of( |
| "div", // Most user-typed comments are contained in a <div> tag |
| "a", // We allow links to be contained in a comment |
| "font" // Some email clients like nesting input in a new font tag |
| ); |
| |
| private HtmlParser() {} |
| |
| /** |
| * Parses comments from html email. |
| * |
| * <p>This parser goes though all html elements in the email and checks for matching patterns. It |
| * keeps track of the last file and comments it encountered to know in which context a parsed |
| * comment belongs. It uses the href attributes of <a> tags to identify comments sent out by |
| * Gerrit as these are generally more reliable then the text captions. |
| * |
| * @param email the message as received from the email service |
| * @param comments a specific set of comments as sent out in the original notification email. |
| * Comments are expected to be in the same order as they were sent out to in the email. |
| * @param changeUrl canonical change URL that points to the change on this Gerrit instance. |
| * Example: https://go-review.googlesource.com/#/c/91570 |
| * @return list of MailComments parsed from the html part of the email |
| */ |
| public static List<MailComment> parse( |
| MailMessage email, Collection<Comment> comments, String changeUrl) { |
| // TODO(hiesel) Add support for Gmail Mobile |
| // TODO(hiesel) Add tests for other popular email clients |
| |
| // This parser goes though all html elements in the email and checks for |
| // matching patterns. It keeps track of the last file and comments it |
| // encountered to know in which context a parsed comment belongs. |
| // It uses the href attributes of <a> tags to identify comments sent out by |
| // Gerrit as these are generally more reliable then the text captions. |
| List<MailComment> parsedComments = new ArrayList<>(); |
| Document d = Jsoup.parse(email.htmlContent()); |
| PeekingIterator<Comment> iter = Iterators.peekingIterator(comments.iterator()); |
| |
| String lastEncounteredFileName = null; |
| Comment lastEncounteredComment = null; |
| for (Element e : d.body().getAllElements()) { |
| String elementName = e.tagName(); |
| boolean isInBlockQuote = |
| e.parents().stream() |
| .anyMatch( |
| p -> |
| p.tagName().equals("blockquote") |
| || MAIL_PROVIDER_EXTRAS.contains(p.className())); |
| |
| if (elementName.equals("a")) { |
| String href = e.attr("href"); |
| // Check if there is still a next comment that could be contained in |
| // this <a> tag |
| if (!iter.hasNext()) { |
| continue; |
| } |
| Comment perspectiveComment = iter.peek(); |
| if (href.equals(ParserUtil.filePath(changeUrl, perspectiveComment))) { |
| if (lastEncounteredFileName == null |
| || !lastEncounteredFileName.equals(perspectiveComment.key.filename)) { |
| // Not a file-level comment, but users could have typed a comment |
| // right after this file annotation to create a new file-level |
| // comment. If this file has a file-level comment, we have already |
| // set lastEncounteredComment to that file-level comment when we |
| // encountered the file link and should not reset it now. |
| lastEncounteredFileName = perspectiveComment.key.filename; |
| lastEncounteredComment = null; |
| } else if (perspectiveComment.lineNbr == 0) { |
| // This was originally a file-level comment |
| lastEncounteredComment = perspectiveComment; |
| iter.next(); |
| } |
| continue; |
| } else if (ParserUtil.isCommentUrl(href, changeUrl, perspectiveComment)) { |
| // This is a regular inline comment |
| lastEncounteredComment = perspectiveComment; |
| iter.next(); |
| continue; |
| } |
| } |
| |
| if (isInBlockQuote) { |
| // There is no user-input in quoted text |
| continue; |
| } |
| if (!WHITELISTED_HTML_TAGS.contains(elementName)) { |
| // We only accept a set of whitelisted tags that can contain user input |
| continue; |
| } |
| if (elementName.equals("a") && e.attr("href").startsWith("mailto:")) { |
| // We don't accept mailto: links in general as they often appear in reply-to lines |
| // (User<user@gmail.com> wrote: ...) |
| continue; |
| } |
| |
| // This is a comment typed by the user |
| // Replace non-breaking spaces and trim string |
| String content = e.ownText().replace('\u00a0', ' ').trim(); |
| boolean isLink = elementName.equals("a"); |
| if (!Strings.isNullOrEmpty(content)) { |
| if (lastEncounteredComment == null && lastEncounteredFileName == null) { |
| // Remove quotation line, email signature and |
| // "Sent from my xyz device" |
| content = ParserUtil.trimQuotation(content); |
| // TODO(hiesel) Add more sanitizer |
| if (!Strings.isNullOrEmpty(content)) { |
| ParserUtil.appendOrAddNewComment( |
| new MailComment( |
| content, null, null, MailComment.CommentType.CHANGE_MESSAGE, isLink), |
| parsedComments); |
| } |
| } else if (lastEncounteredComment == null) { |
| ParserUtil.appendOrAddNewComment( |
| new MailComment( |
| content, |
| lastEncounteredFileName, |
| null, |
| MailComment.CommentType.FILE_COMMENT, |
| isLink), |
| parsedComments); |
| } else { |
| ParserUtil.appendOrAddNewComment( |
| new MailComment( |
| content, |
| null, |
| lastEncounteredComment, |
| MailComment.CommentType.INLINE_COMMENT, |
| isLink), |
| parsedComments); |
| } |
| } |
| } |
| return parsedComments; |
| } |
| } |