|  | // Copyright (C) 2016 The Android Open Source Project | 
|  | // | 
|  | // Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | // you may not use this file except in compliance with the License. | 
|  | // You may obtain a copy of the License at | 
|  | // | 
|  | // http://www.apache.org/licenses/LICENSE-2.0 | 
|  | // | 
|  | // Unless required by applicable law or agreed to in writing, software | 
|  | // distributed under the License is distributed on an "AS IS" BASIS, | 
|  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | // See the License for the specific language governing permissions and | 
|  | // limitations under the License. | 
|  |  | 
|  | package com.google.gerrit.mail; | 
|  |  | 
|  | import com.google.common.base.Strings; | 
|  | import com.google.common.collect.ImmutableSet; | 
|  | import com.google.common.collect.Iterators; | 
|  | import com.google.common.collect.PeekingIterator; | 
|  | import com.google.gerrit.entities.Comment; | 
|  | import java.util.ArrayList; | 
|  | import java.util.Collection; | 
|  | import java.util.List; | 
|  | import org.jsoup.Jsoup; | 
|  | import org.jsoup.nodes.Document; | 
|  | import org.jsoup.nodes.Element; | 
|  |  | 
|  | /** Provides functionality for parsing the HTML part of a {@link MailMessage}. */ | 
|  | public class HtmlParser { | 
|  |  | 
|  | private static final ImmutableSet<String> MAIL_PROVIDER_EXTRAS = | 
|  | ImmutableSet.of( | 
|  | "gmail_extra", // "On 01/01/2017 User<user@gmail.com> wrote:" | 
|  | "gmail_quote" // Used for quoting original content | 
|  | ); | 
|  |  | 
|  | private static final ImmutableSet<String> WHITELISTED_HTML_TAGS = | 
|  | ImmutableSet.of( | 
|  | "div", // Most user-typed comments are contained in a <div> tag | 
|  | "a", // We allow links to be contained in a comment | 
|  | "font" // Some email clients like nesting input in a new font tag | 
|  | ); | 
|  |  | 
|  | private HtmlParser() {} | 
|  |  | 
|  | /** | 
|  | * Parses comments from html email. | 
|  | * | 
|  | * <p>This parser goes though all html elements in the email and checks for matching patterns. It | 
|  | * keeps track of the last file and comments it encountered to know in which context a parsed | 
|  | * comment belongs. It uses the href attributes of <a> tags to identify comments sent out by | 
|  | * Gerrit as these are generally more reliable then the text captions. | 
|  | * | 
|  | * @param email the message as received from the email service | 
|  | * @param comments a specific set of comments as sent out in the original notification email. | 
|  | *     Comments are expected to be in the same order as they were sent out to in the email. | 
|  | * @param changeUrl canonical change URL that points to the change on this Gerrit instance. | 
|  | *     Example: https://go-review.googlesource.com/#/c/91570 | 
|  | * @return list of MailComments parsed from the html part of the email | 
|  | */ | 
|  | public static List<MailComment> parse( | 
|  | MailMessage email, Collection<Comment> comments, String changeUrl) { | 
|  | // TODO(hiesel) Add support for Gmail Mobile | 
|  | // TODO(hiesel) Add tests for other popular email clients | 
|  |  | 
|  | // This parser goes though all html elements in the email and checks for | 
|  | // matching patterns. It keeps track of the last file and comments it | 
|  | // encountered to know in which context a parsed comment belongs. | 
|  | // It uses the href attributes of <a> tags to identify comments sent out by | 
|  | // Gerrit as these are generally more reliable then the text captions. | 
|  | List<MailComment> parsedComments = new ArrayList<>(); | 
|  | Document d = Jsoup.parse(email.htmlContent()); | 
|  | PeekingIterator<Comment> iter = Iterators.peekingIterator(comments.iterator()); | 
|  |  | 
|  | String lastEncounteredFileName = null; | 
|  | Comment lastEncounteredComment = null; | 
|  | for (Element e : d.body().getAllElements()) { | 
|  | String elementName = e.tagName(); | 
|  | boolean isInBlockQuote = | 
|  | e.parents().stream() | 
|  | .anyMatch( | 
|  | p -> | 
|  | p.tagName().equals("blockquote") | 
|  | || MAIL_PROVIDER_EXTRAS.contains(p.className())); | 
|  |  | 
|  | if (elementName.equals("a")) { | 
|  | String href = e.attr("href"); | 
|  | // Check if there is still a next comment that could be contained in | 
|  | // this <a> tag | 
|  | if (!iter.hasNext()) { | 
|  | continue; | 
|  | } | 
|  | Comment perspectiveComment = iter.peek(); | 
|  | if (href.equals(ParserUtil.filePath(changeUrl, perspectiveComment))) { | 
|  | if (lastEncounteredFileName == null | 
|  | || !lastEncounteredFileName.equals(perspectiveComment.key.filename)) { | 
|  | // Not a file-level comment, but users could have typed a comment | 
|  | // right after this file annotation to create a new file-level | 
|  | // comment. If this file has a file-level comment, we have already | 
|  | // set lastEncounteredComment to that file-level comment when we | 
|  | // encountered the file link and should not reset it now. | 
|  | lastEncounteredFileName = perspectiveComment.key.filename; | 
|  | lastEncounteredComment = null; | 
|  | } else if (perspectiveComment.lineNbr == 0) { | 
|  | // This was originally a file-level comment | 
|  | lastEncounteredComment = perspectiveComment; | 
|  | iter.next(); | 
|  | } | 
|  | continue; | 
|  | } else if (ParserUtil.isCommentUrl(href, changeUrl, perspectiveComment)) { | 
|  | // This is a regular inline comment | 
|  | lastEncounteredComment = perspectiveComment; | 
|  | iter.next(); | 
|  | continue; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (isInBlockQuote) { | 
|  | // There is no user-input in quoted text | 
|  | continue; | 
|  | } | 
|  | if (!WHITELISTED_HTML_TAGS.contains(elementName)) { | 
|  | // We only accept a set of whitelisted tags that can contain user input | 
|  | continue; | 
|  | } | 
|  | if (elementName.equals("a") && e.attr("href").startsWith("mailto:")) { | 
|  | // We don't accept mailto: links in general as they often appear in reply-to lines | 
|  | // (User<user@gmail.com> wrote: ...) | 
|  | continue; | 
|  | } | 
|  |  | 
|  | // This is a comment typed by the user | 
|  | // Replace non-breaking spaces and trim string | 
|  | String content = e.ownText().replace('\u00a0', ' ').trim(); | 
|  | boolean isLink = elementName.equals("a"); | 
|  | if (!Strings.isNullOrEmpty(content)) { | 
|  | if (lastEncounteredComment == null && lastEncounteredFileName == null) { | 
|  | // Remove quotation line, email signature and | 
|  | // "Sent from my xyz device" | 
|  | content = ParserUtil.trimQuotation(content); | 
|  | // TODO(hiesel) Add more sanitizer | 
|  | if (!Strings.isNullOrEmpty(content)) { | 
|  | ParserUtil.appendOrAddNewComment( | 
|  | new MailComment( | 
|  | content, null, null, MailComment.CommentType.CHANGE_MESSAGE, isLink), | 
|  | parsedComments); | 
|  | } | 
|  | } else if (lastEncounteredComment == null) { | 
|  | ParserUtil.appendOrAddNewComment( | 
|  | new MailComment( | 
|  | content, | 
|  | lastEncounteredFileName, | 
|  | null, | 
|  | MailComment.CommentType.FILE_COMMENT, | 
|  | isLink), | 
|  | parsedComments); | 
|  | } else { | 
|  | ParserUtil.appendOrAddNewComment( | 
|  | new MailComment( | 
|  | content, | 
|  | null, | 
|  | lastEncounteredComment, | 
|  | MailComment.CommentType.INLINE_COMMENT, | 
|  | isLink), | 
|  | parsedComments); | 
|  | } | 
|  | } | 
|  | } | 
|  | return parsedComments; | 
|  | } | 
|  | } |