001/**
002 * Copyright 2014 Tampere University of Technology, Pori Department
003 * 
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 * 
008 *   http://www.apache.org/licenses/LICENSE-2.0
009 * 
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package core.tut.pori.utils;
017
018import java.io.IOException;
019import java.io.InputStream;
020import java.net.URL;
021import java.util.EnumSet;
022
023import javax.xml.bind.annotation.XmlEnum;
024import javax.xml.bind.annotation.XmlEnumValue;
025
026import org.apache.commons.codec.DecoderException;
027import org.apache.commons.codec.binary.Hex;
028import org.apache.commons.lang3.StringUtils;
029import org.apache.log4j.Logger;
030
031/**
 * This class can be used to validate whether the content behind a URL link is photo (image) or video content.
033 * 
034 * The validation of the URL is done by downloading the first few bytes of the content and comparing the retrieved bytes to the list of known magic numbers.
035 * 
 * The supported photo formats for this validator are:
 * <ul>
 *  <li>BMP</li>
 *  <li>GIF</li>
 *  <li>JPEG</li>
 *  <li>PNG</li>
 * </ul>
 * 
 * The supported video formats for this validator are:
 * <ul>
 *  <li>AVI</li>
 *  <li>Flash Video</li>
 *  <li>MKV</li>
 *  <li>MPEG Video</li>
 *  <li>MOV</li>
 * </ul>
051 * 
052 * Note that the magic number validation is a simplified operation and no end bytes for file types will ever be checked, whether they denote the file type or not.
053 * 
054 */
055public class MediaUrlValidator {
056  /* media types */
057  /** media type name for audio content */
058  public static final String MEDIA_TYPE_AUDIO = "AUDIO";
059  /** media type name for photo/image content */
060  public static final String MEDIA_TYPE_PHOTO = "PHOTO";
061  /** media type name for video content */
062  public static final String MEDIA_TYPE_VIDEO = "VIDEO";
063  /** media type name for unknown/unspecified content */
064  public static final String MEDIA_TYPE_UNKNOWN = "UNKNOWN";
065  private static final Logger LOGGER = Logger.getLogger(MediaUrlValidator.class);
066  /** avi actually is of format 52 49 46 46 xx xx xx xx 41 56 49 20 4C 49 53 54 where xx denote file size, the 52 49 46 46 also denote other formats such as wave files, but we'll simply ignore this minor issue */
067  private static final byte[] MAGIC_BYTE_AVI;
068  private static final byte[] MAGIC_BYTE_BMP;
069  private static final byte[] MAGIC_BYTE_FLASH_VIDEO;
070  private static final byte[] MAGIC_BYTE_GIF_1;
071  private static final byte[] MAGIC_BYTE_GIF_2;
072  /** note: a valid jpeg file should also end with ffd9, but this is will not be checked by the validator */
073  private static final byte[] MAGIC_BYTE_JPEG;
074  private static final byte[] MAGIC_BYTE_MKV;
075  /** mpeg file is actually 00 00 01 Bx where x is a number. Last byte "Bx" is ignored because Hex.decodeHex requires Even number of bytes */
076  private static final byte[] MAGIC_BYTE_MPEG_VIDEO;
077  private static final byte[] MAGIC_BYTE_MOV;
078  private static final byte[] MAGIC_BYTE_PNG;
079  static{
080    try {
081      MAGIC_BYTE_BMP = Hex.decodeHex("424d".toCharArray());
082      MAGIC_BYTE_GIF_1 = Hex.decodeHex("474946383961".toCharArray());
083      MAGIC_BYTE_GIF_2 = Hex.decodeHex("474946383761".toCharArray());
084      MAGIC_BYTE_JPEG = Hex.decodeHex("ffd8".toCharArray());
085      MAGIC_BYTE_PNG = Hex.decodeHex("89504e470d0a1a0a".toCharArray());
086      MAGIC_BYTE_MOV = Hex.decodeHex("000000146674797071742020".toCharArray());
087      MAGIC_BYTE_MPEG_VIDEO = Hex.decodeHex("000001".toCharArray());
088      MAGIC_BYTE_MKV = Hex.decodeHex("1a45dfa3934282886d6174726f736b61".toCharArray());
089      MAGIC_BYTE_FLASH_VIDEO = Hex.decodeHex("464c5601".toCharArray());
090      MAGIC_BYTE_AVI = Hex.decodeHex("52494646".toCharArray());
091    } catch (DecoderException ex) { // this should never happen
092      LOGGER.error(ex,ex);
093      throw new IllegalArgumentException(ex.getMessage());
094    }
095  }
096  private static final int BUFFER_SIZE = 20; // take first 20 bytes in to the buffer
097
098  /**
099   * Media type declaration.
100   */
101  @XmlEnum
102  public enum MediaType {
103    /** media type is unknown or unspecified */
104    @XmlEnumValue(value = MEDIA_TYPE_UNKNOWN)
105    UNKNOWN(0),
106    /** media is of photo/image content */
107    @XmlEnumValue(value = MEDIA_TYPE_PHOTO)
108    PHOTO(1),
109    /** media is of video content */
110    @XmlEnumValue(value = MEDIA_TYPE_VIDEO)
111    VIDEO(2),
112    /** media is of audio content */
113    @XmlEnumValue(value = MEDIA_TYPE_AUDIO)
114    AUDIO(3);
115    
116    private int _value;
117    
118    /**
119     * 
120     * @param value
121     */
122    private MediaType(int value){
123      _value = value;
124    }
125    
126    /**
127     * 
128     * @return the media type as integer
129     */
130    public int toInt(){
131      return _value;
132    }
133    
134    /**
135     * 
136     * @param value
137     * @return the value as MediaType
138     * @throws IllegalArgumentException on bad value
139     */
140    public static MediaType fromInt(int value) throws IllegalArgumentException {
141      for(MediaType mt : MediaType.values()){
142        if(mt._value == value){
143          return mt;
144        }
145      }
146      throw new IllegalArgumentException("Bad "+MediaType.class.toString()+" : "+value);
147    }
148    
149    /**
150     * 
151     * @param mediaTypes
152     * @return the passed media types converted to primitive int array or null if empty or null set was passed
153     */
154    public static int[] toInt(EnumSet<MediaType> mediaTypes){
155      if(mediaTypes == null || mediaTypes.isEmpty()){
156        return null;
157      }
158      int[] types = new int[mediaTypes.size()];
159      int index = -1;
160      for(MediaType t : mediaTypes){
161        types[++index] = t.toInt();
162      }
163      return types;
164    }
165  } // enum MediaType
166
167  /**
168   * 
169   * @param array must be at least as long as the comparator
170   * @param with
171   * @return true if the given array starts with the given with
172   */
173  private static boolean startsWith(byte[] array, byte[] with){
174    for(int i=0; i<with.length; ++i){
175      if(array[i] != with[i]){
176        return false;
177      }
178    }
179    return true;
180  }
181  
182  /**
183   * 
184   * @param url
185   * @return media type for the given URL
186   */
187  public MediaType validateUrl(String url){
188    if(StringUtils.isBlank(url)){
189      LOGGER.warn("Empty URL.");
190      return MediaType.UNKNOWN;
191    }
192    
193    LOGGER.debug("Validating URL: "+url); 
194    try { 
195      URL u = new URL(url);
196      try (InputStream input = u.openStream()){
197        byte[] bytes = new byte[BUFFER_SIZE];
198        if(input.read(bytes) < BUFFER_SIZE){
199          LOGGER.warn("Failed to read first "+BUFFER_SIZE+" bytes.");
200          return MediaType.UNKNOWN;
201        }
202        
203        if(startsWith(bytes, MAGIC_BYTE_BMP)){
204          LOGGER.debug("Detected a bmp file.");
205          return MediaType.PHOTO;
206        }else if(startsWith(bytes, MAGIC_BYTE_PNG)){
207          LOGGER.debug("Detected a png file.");
208          return MediaType.PHOTO;
209        }else if(startsWith(bytes, MAGIC_BYTE_JPEG)){
210          LOGGER.debug("Detected a jpeg file.");
211          return MediaType.PHOTO;
212        }else if(startsWith(bytes, MAGIC_BYTE_GIF_1) || startsWith(bytes, MAGIC_BYTE_GIF_2)){
213          LOGGER.debug("Detected a gif file.");
214          return MediaType.PHOTO;
215        }else if(startsWith(bytes, MAGIC_BYTE_AVI)){
216          LOGGER.debug("Detected an avi file.");
217          return MediaType.VIDEO;
218        }else if(startsWith(bytes, MAGIC_BYTE_FLASH_VIDEO)){
219          LOGGER.debug("Detected a flash file.");
220          return MediaType.VIDEO;
221        }else if(startsWith(bytes, MAGIC_BYTE_MKV)){
222          LOGGER.debug("Detected a mkv file.");
223          return MediaType.VIDEO;
224        }else if(startsWith(bytes, MAGIC_BYTE_MOV)){
225          LOGGER.debug("Detected a mov file.");
226          return MediaType.VIDEO;
227        }else if(startsWith(bytes, MAGIC_BYTE_MPEG_VIDEO)){
228          LOGGER.debug("Detected a mpg file.");
229          return MediaType.VIDEO;
230        }
231      }
232    } catch (IOException | IllegalArgumentException ex) { // java's URL connection randomly throws illegal argument exception on certain valid urls, so catch and ignore it   
233      LOGGER.warn("Failed to read URL: "+url);
234      LOGGER.debug(ex, ex);
235    }
236    return MediaType.UNKNOWN;
237  }
238}